1 module gff3d.gff3record; 2 3 import std.stdio; 4 import std.algorithm.iteration: splitter; 5 import std.stdio: chunks, File; 6 import std.algorithm: filter,map; 7 import std.traits: ReturnType; 8 import std.array: replace; 9 import std.conv: to; 10 import std.string; 11 import std.file: mkdir; 12 import std.algorithm.searching: countUntil; 13 import std.range: drop, ForwardRange, chain; 14 import std.format : format; 15 16 import dhtslib.coordinates; 17 18 //import dhtslib.htslib.hts_log; 19 20 /** GFF3 Record 21 22 Format Documentation: 23 * http://gmod.org/wiki/GFF3 24 * https://useast.ensembl.org/info/website/upload/gff3.html 25 * https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md 26 * http://www.sequenceontology.org/gff3.shtml 27 * 28 * TODO: make sortable 29 * TODO: Make record builder (i.e. start with blank record and add attrs) to prep for writing 30 */ 31 struct GFF3_Record 32 { 33 private ubyte[] raw; 34 35 /// constructor (raw ubytes) 36 this(ubyte[]data){ 37 this.raw = data; 38 } 39 /// constructor (string) 40 this(string data){ 41 this.raw = cast(ubyte[])data; 42 } 43 44 /// TODO: Not implemented; (almost) always true 45 @property bool isValid() 46 { 47 //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length)); 48 return (raw.length >= 0 ? true : false); 49 } 50 51 /// Column 1: seqid (aka contig); basis for the coordinate system 52 @property seqid() const { return cast(string)this.raw.splitter('\t').front; } 53 /// ditto 54 @property contig() const{ return cast(string)this.raw.splitter('\t').front; } 55 56 /// Column 2: source; software, procedure, or database originating the record 57 @property source() const { return cast(string)this.raw.splitter('\t').drop(1).front; } 58 59 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number 60 @property type() const { return cast(string)this.raw.splitter('\t').drop(2).front; } 61 62 /// Columns 4 & 5: returns Coordinate set: Obc format 63 @property coordinates() const 64 { 65 auto start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long; 66 auto end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long; 67 return Obc(start, end); 68 } 69 /// Columns 4: start; 1-based integer start position of the feature 70 @property start() const { return this.coordinates.start; } 71 /// Column 5: end; closed coordinate integer ending nucleotide position of the feature 72 @property end() const { return this.coordinates.end; } 73 74 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined." 75 /// Tragically, score can be either a float, or not present (".") 76 /// Totally arbitrarily, we will represent absent as -1 77 @property score() const { 78 if(cast(string)this.raw.splitter('\t').drop(5).front=="."){ 79 return -1.0; 80 } 81 return (cast(string)this.raw.splitter('\t').drop(5).front).to!float; 82 } 83 84 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown) 85 @property strand() const { 86 return cast(char)this.raw.splitter('\t').drop(6).front[0]; 87 } 88 89 /** Column 8: phase; 90 For features of type "CDS", the phase indicates where the feature begins with 91 reference to the reading frame. The phase is one of the integers 0, 1, or 2, 92 indicating the number of bases that should be removed from the beginning of 93 this feature to reach the first base of the next codon. In other words, a 94 phase of "0" indicates that the next codon begins at the first base of the 95 region described by the current line, a phase of "1" indicates that the next 96 codon begins at the second base of this region, and a phase of "2" indicates 97 that the codon begins at the third base of this region. This is NOT to be 98 confused with the frame, which is simply start modulo 3. 99 100 For forward strand features, phase is counted from the start field. 101 For reverse strand features, phase is counted from the end field. 102 103 The phase is REQUIRED for all CDS features. 104 105 Tragically, phase can be either an integer (0, 1, 2), or not present (".") 106 Totally arbitrarily, we will represent absent as -1 107 **/ 108 @property phase() const { 109 if(cast(string)this.raw.splitter('\t').drop(7).front=="."){ 110 return -1; 111 } 112 return (cast(string)this.raw.splitter('\t').drop(7).front).to!int; 113 } 114 115 /// Column 9: attributes; A list of ;-separated feature attributes in key=value form 116 string attributes(const string field) const { return this.opIndex(field); } 117 118 /// Provides map key lookup semantics for column 9 attributes 119 string opIndex(string field) const { 120 const attrs=this.raw.splitter('\t').drop(8).front.splitter(";"); 121 auto val = attrs // actualy a Range of key=val 122 .filter!(kv => cast(string)(kv[0 .. kv.countUntil('=')]) == field); 123 //.front; // -- AssertError if range is empty 124 if (!val.empty) return cast(string) (val.front[val.front.countUntil('=')+1..$]); 125 else return ""; 126 127 /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront) 128 auto vals = attrs 129 .filter!(kv => kv.startsWith(field ~ "=")) 130 .map!(kv => kv[kv.countUntil('=')+1 .. $]); 131 if (!vals.empty) return cast(string) val.front; 132 else return ""; 133 +/ 134 } 135 136 /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...} 137 bool hasTag(string tagName)() 138 { 139 return this["tag"].splitter(",").filter!(t => t == tagName).count > 0; 140 } 141 142 /// Computed feature length 143 @property length() const { return this.end - (this.start-1); } 144 /// Relative start === 1 145 @property relativeStart() const { return Ob(1); } 146 /// Relative start === the feature length 147 @property relativeEnd() const { return Ob(this.length); } 148 149 /// Genomic coordinate at offset into feature, taking strandedness into account 150 @property coordinateAtOffset(long offset) const 151 { 152 // GFF3 features are 1-based coordinates 153 assert(offset > 0); 154 offset--; // necessary in a 1-based closed coordinate system 155 156 // for - strand count from end; for +, ., and ? strand count from start 157 immutable auto begin = (this.strand == '-' ? this.end : this.start); 158 159 // for - strand count backward; for +, ., and ? strand count forward 160 immutable auto direction = (this.strand == '-' ? -1 : 1); 161 162 return Ob(begin + (direction * offset)); 163 } 164 /// Genomic coordinate at beginning of feature, taking strandedness into account 165 @property coordinateAtBegin() const 166 { 167 return this.coordinateAtOffset(1); 168 } 169 /// Genomic coordinate at end of feature, taking strandedness into account 170 @property coordinateAtEnd() const 171 { 172 return this.coordinateAtOffset(this.length); 173 } 174 175 string toString() const { 176 return cast(string) this.raw; 177 } 178 /// Returns a string with the canonical "chr:start-end" representation 179 @property string canonicalRepresentation() const { 180 return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string; 181 } 182 /// Return the seqURI representation 183 @property string seqURI() const { 184 return format("seq:unk/%s", this.canonicalRepresentation); 185 } 186 187 } 188 unittest{ 189 auto rec = GFF3_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tID=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line) 190 auto rec_neg= GFF3_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t-\t.\tID=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line) 191 192 assert(rec.seqid=="chr1"); 193 assert(rec.source=="HAVANA"); 194 assert(rec.type=="gene"); 195 assert(rec.start==11_869); 196 assert(rec.end==14_409); 197 assert(rec.score==-1.0); 198 assert(rec.strand()=='+'); 199 assert(rec.phase==-1); 200 assert(rec["ID"] == "ENSG00000223972.5"); 201 assert(rec["gene_id"] == "ENSG00000223972.5"); 202 assert(rec["gene_type"] == "transcribed_unprocessed_pseudogene"); 203 assert(rec["gene_name"] == "DDX11L1"); 204 assert(rec["level"] == "2"); 205 assert(rec["havana_gene"] == "OTTHUMG00000000961.2"); 206 207 assert(rec.length == 2541); 208 assert(rec.relativeStart == 1); 209 assert(rec.relativeEnd == 2541); 210 211 // Test forward and backward offsets 212 assert(rec.coordinateAtOffset(2) == 11_870); 213 assert(rec_neg.coordinateAtOffset(2) == 14_408); 214 215 assert(rec.coordinateAtBegin == 11_869); 216 assert(rec.coordinateAtEnd == 14_409); 217 218 assert(rec_neg.coordinateAtBegin == 14_409); 219 assert(rec_neg.coordinateAtEnd == 11_869); 220 221 // TODO validator 222 assert(rec.isValid); 223 }