Code Monkey home page Code Monkey logo

adam-gfa's Introduction

adam-gfa

Graphical Fragment Assembly (GFA) support for ADAM.

Build Status Maven Central API Documentation

Hacking adam-gfa

Install

To build

$ mvn install

Running adam-gfa

Transform GFA 1.0 to generic Gfa1Record records in Parquet format

$ spark-submit \
    --class com.github.heuermh.adam.gfa.Gfa1ToDataframe \
    target/adam-gfa_2.12-${version}.jar \
    in.gfa \
    out.parquet

Transform GFA 1.0 to specific Containment, Link, Path, Segment, and Traversal records in Parquet format

$ spark-submit \
    --class com.github.heuermh.adam.gfa.Gfa1ToDataframes \
    target/adam-gfa_2.12-${version}.jar \
    in.gfa \
    out

(creates separate out.containments.parquet, out.links.parquet, out.paths.parquet, out.segments.parquet, and out.traversals.parquet directories)

Graphical Fragment Assembly (GFA) version 1.0 schema in Parquet format

Gfa1Record

Gfa1Record (scaladoc)

message spark_schema {
  optional binary recordType (STRING);
  optional binary name (STRING);
  optional binary sequence (STRING);
  optional int32 length;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional binary sequenceChecksum (STRING);
  optional binary sequenceUri (STRING);
  optional binary stableName (STRING);
  optional int32 stableOffset;
  optional int32 stableRank;
  optional binary id (STRING);
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional int32 mappingQuality;
  optional int32 mismatchCount;
  optional binary pathName (STRING);
  optional group segments (LIST) {
    repeated group list {
      optional group element {
        optional binary id (STRING);
        optional binary orientation (STRING);
      }
    }
  }
  optional group overlaps (LIST) {
    repeated group list {
      optional binary element (STRING);
    }
  }
  optional int32 ordinal;
  optional group container {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group contained {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional int32 position;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Containment

Containment (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional group container {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group contained {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional int32 position;
  optional binary overlap (STRING);
  optional int32 mismatchCount;
  optional int32 readCount;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Link

Link (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional int32 mappingQuality;
  optional int32 mismatchCount;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Path

Path (scaladoc)

message spark_schema {
  optional binary pathName (STRING);
  optional group segments (LIST) {
    repeated group list {
      optional group element {
        optional binary id (STRING);
        optional binary orientation (STRING);
      }
    }
  }
  optional group overlaps (LIST) {
    repeated group list {
      optional binary element (STRING);
    }
  }
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Segment

Segment (scaladoc)

message spark_schema {
  optional binary name (STRING);
  optional binary sequence (STRING);
  optional int32 length;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional binary sequenceChecksum (STRING);
  optional binary sequenceUri (STRING);
  optional binary stableName (STRING);
  optional int32 stableOffset;
  optional int32 stableRank;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Traversal

Traversal (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional binary pathName (STRING);
  optional int32 ordinal;
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

adam-gfa's People

Contributors

dependabot[bot] avatar heuermh avatar

Stargazers

 avatar  avatar

Watchers

 avatar  avatar  avatar

adam-gfa's Issues

NullPointerException when writing GFA 1.0 + traversals to dataframe

$ spark-submit \
  --driver-memory 30G \
  --class com.github.heuermh.adam.gfa.Gfa1ToDataframe \
  target/adam-gfa_2.11-0.4.0-SNAPSHOT.jar \
  human__pan.AF0__18.traversals.gfa \
  human__pan.AF0__18.traversals.snappy.parquet

...
java.lang.NullPointerException
	at scala.collection.convert.Wrappers$JListWrapper.length(Wrappers.scala:86)
	at scala.collection.SeqLike$class.size(SeqLike.scala:106)
	at scala.collection.AbstractSeq.size(Seq.scala:41)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.serializefromobject_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:244)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:248)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web: a new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.