Skip to content

Commit

Permalink
fix: score transformed to array and renamed value
Browse files Browse the repository at this point in the history
  • Loading branch information
mkarmona committed Jun 26, 2018
1 parent 9a617ce commit 5fddfb8
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/main/scala/ot/geckopipe/interval/Interval.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ object Interval extends LazyLogging {
StructField("position_start", LongType) ::
StructField("position_end", LongType) ::
StructField("gene_id", StringType) ::
-StructField("value", DoubleType) ::
+StructField("score", DoubleType) ::
StructField("feature", StringType) :: Nil)

def load(from: String)(implicit ss: SparkSession): DataFrame = {
Expand Down Expand Up @@ -41,12 +41,13 @@ object Interval extends LazyLogging {

logger.info("generate pchic dataset from file and aggregating by range and gene")
val interval = load(conf.interval.path)
+.withColumn("value", array(col("score")))
.withColumn("tokens", extractValidTokensFromPath(col("filename")))
.withColumn("source_id", col("tokens").getItem(0))
.withColumn("tissue_id", col("tokens").getItem(1))
.drop("filename", "tokens")
.withColumn("position", explode(fromRangeToArray(col("position_start"), col("position_end"))))
-.drop("position_start", "position_end")
+.drop("position_start", "position_end", "score")
.repartitionByRange(col("chr_id").asc, col("position").asc)

interval.join(vIdx.table, Seq("chr_id", "position"))
Expand Down

0 comments on commit 5fddfb8

Please sign in to comment.