1 module gff3d.gtfrecord; 2 3 import std.stdio; 4 import std.algorithm.iteration: splitter; 5 import std.stdio: chunks, File; 6 import std.algorithm: filter,map; 7 import std.traits: ReturnType; 8 import std.array: replace; 9 import std.conv: to; 10 import std.string; 11 import std.file: mkdir; 12 import std.algorithm.searching: countUntil; 13 import std.range: drop, ForwardRange, chain; 14 import std.format : format; 15 16 import dhtslib.coordinates; 17 18 //import dhtslib.htslib.hts_log; 19 20 /** GTF Record 21 22 Format Documentation: 23 * http://gmod.org/wiki/GFF2#The_GFF2_File_Format 24 * https://useast.ensembl.org/info/website/upload/gff.html 25 * 26 * TODO: make sortable 27 * TODO: Make record builder (i.e. start with blank record and add attrs) to prep for writing 28 */ 29 struct GTF_Record 30 { 31 private ubyte[] raw; 32 33 /// constructor (raw ubytes) 34 this(ubyte[]data){ 35 this.raw = data; 36 } 37 /// constructor (string) 38 this(string data){ 39 this.raw = cast(ubyte[])data; 40 } 41 42 /// TODO: Not implemented; (almost) always true 43 @property bool isValid() 44 { 45 //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length)); 46 return (raw.length >= 0 ? true : false); 47 } 48 49 /// Column 1: seqid (aka contig); basis for the coordinate system 50 @property seqid() const { return cast(string)this.raw.splitter('\t').front; } 51 /// ditto 52 @property contig() const{ return cast(string)this.raw.splitter('\t').front; } 53 54 /// Column 2: source; software, procedure, or database originating the record 55 @property source() const { return cast(string)this.raw.splitter('\t').drop(1).front; } 56 57 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number 58 @property type() const { return cast(string)this.raw.splitter('\t').drop(2).front; } 59 60 /// Columns 4 & 5: returns Coordinate set: Obc format 61 @property coordinates() const 62 { 63 auto start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long; 64 auto end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long; 65 return Obc(start, end); 66 } 67 /// Columns 4: start; 1-based integer start position of the feature 68 @property start() const { return this.coordinates.start; } 69 /// Column 5: end; closed coordinate integer ending nucleotide position of the feature 70 @property end() const { return this.coordinates.end; } 71 72 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined." 73 /// Tragically, score can be either a float, or not present (".") 74 /// Totally arbitrarily, we will represent absent as -1 75 @property score() const { 76 if(cast(string)this.raw.splitter('\t').drop(5).front=="."){ 77 return -1.0; 78 } 79 return (cast(string)this.raw.splitter('\t').drop(5).front).to!float; 80 } 81 82 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown) 83 @property strand() const { 84 return cast(char)this.raw.splitter('\t').drop(6).front[0]; 85 } 86 87 /** Column 8: phase; 88 For features of type "CDS", the phase indicates where the feature begins with 89 reference to the reading frame. The phase is one of the integers 0, 1, or 2, 90 indicating the number of bases that should be removed from the beginning of 91 this feature to reach the first base of the next codon. In other words, a 92 phase of "0" indicates that the next codon begins at the first base of the 93 region described by the current line, a phase of "1" indicates that the next 94 codon begins at the second base of this region, and a phase of "2" indicates 95 that the codon begins at the third base of this region. This is NOT to be 96 confused with the frame, which is simply start modulo 3. 97 98 For forward strand features, phase is counted from the start field. 99 For reverse strand features, phase is counted from the end field. 100 101 The phase is REQUIRED for all CDS features. 102 103 Tragically, phase can be either an integer (0, 1, 2), or not present (".") 104 Totally arbitrarily, we will represent absent as -1 105 **/ 106 @property phase() const { 107 if(cast(string)this.raw.splitter('\t').drop(7).front=="."){ 108 return -1; 109 } 110 return (cast(string)this.raw.splitter('\t').drop(7).front).to!int; 111 } 112 113 /// Column 9: attributes; A list of ;-separated feature attributes in key=value form 114 string attributes(const string field) const { return this.opIndex(field); } 115 116 /// Provides map key lookup semantics for column 9 attributes 117 string opIndex(string field) const { 118 const attrs=this.raw.splitter('\t').drop(8).front.splitter(";"); 119 auto val = attrs // actualy a Range of key=val 120 .filter!(kv => ((cast(string) kv).strip[0 .. (cast(string) kv).strip.countUntil(' ')]) == field); 121 //.front; // -- AssertError if range is empty 122 if (!val.empty) return ((cast(string) val.front).strip[(cast(string) val.front).strip.countUntil(' ') + 1..$]).strip.strip("\""); 123 else return ""; 124 125 /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront) 126 auto vals = attrs 127 .filter!(kv => kv.startsWith(field ~ "=")) 128 .map!(kv => kv[kv.countUntil('=')+1 .. $]); 129 if (!vals.empty) return cast(string) val.front; 130 else return ""; 131 +/ 132 } 133 134 /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...} 135 bool hasTag(string tagName)() 136 { 137 return this["tag"].splitter(",").filter!(t => t == tagName).count > 0; 138 } 139 140 /// Computed feature length 141 @property length() const { return this.end - (this.start-1); } 142 /// Relative start === 1 143 @property relativeStart() const { return Ob(1); } 144 /// Relative start === the feature length 145 @property relativeEnd() const { return Ob(this.length); } 146 147 /// Genomic coordinate at offset into feature, taking strandedness into account 148 @property coordinateAtOffset(long offset) const 149 { 150 // GFF3 features are 1-based coordinates 151 assert(offset > 0); 152 offset--; // necessary in a 1-based closed coordinate system 153 154 // for - strand count from end; for +, ., and ? strand count from start 155 immutable auto begin = (this.strand == '-' ? this.end : this.start); 156 157 // for - strand count backward; for +, ., and ? strand count forward 158 immutable auto direction = (this.strand == '-' ? -1 : 1); 159 160 return Ob(begin + (direction * offset)); 161 } 162 /// Genomic coordinate at beginning of feature, taking strandedness into account 163 @property coordinateAtBegin() const 164 { 165 return this.coordinateAtOffset(1); 166 } 167 /// Genomic coordinate at end of feature, taking strandedness into account 168 @property coordinateAtEnd() const 169 { 170 return this.coordinateAtOffset(this.length); 171 } 172 173 string toString() const { 174 return cast(string) this.raw; 175 } 176 /// Returns a string with the canonical "chr:start-end" representation 177 @property string canonicalRepresentation() const { 178 return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string; 179 } 180 /// Return the seqURI representation 181 @property string seqURI() const { 182 return format("seq:unk/%s", this.canonicalRepresentation); 183 } 184 185 } 186 unittest{ 187 auto rec = GTF_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tID \"ENSG00000223972.5\" ; gene_id ENSG00000223972.5 ; gene_id ENSG00000223972.5 ; gene_type transcribed_unprocessed_pseudogene ; gene_name DDX11L1 ; level 2 ; havana_gene OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line) 188 auto rec_neg= GTF_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t-\t.\tID \"ENSG00000223972.5\" ; gene_id ENSG00000223972.5 ; gene_id ENSG00000223972.5 ; gene_type transcribed_unprocessed_pseudogene ; gene_name DDX11L1 ; level 2 ; havana_gene OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line) 189 190 assert(rec.seqid=="chr1"); 191 assert(rec.source=="HAVANA"); 192 assert(rec.type=="gene"); 193 assert(rec.start==11_869); 194 assert(rec.end==14_409); 195 assert(rec.score==-1.0); 196 assert(rec.strand()=='+'); 197 assert(rec.phase==-1); 198 assert(rec["ID"] == "ENSG00000223972.5"); 199 assert(rec["gene_id"] == "ENSG00000223972.5"); 200 assert(rec["gene_type"] == "transcribed_unprocessed_pseudogene"); 201 assert(rec["gene_name"] == "DDX11L1"); 202 assert(rec["level"] == "2"); 203 assert(rec["havana_gene"] == "OTTHUMG00000000961.2"); 204 205 assert(rec.length == 2541); 206 assert(rec.relativeStart == 1); 207 assert(rec.relativeEnd == 2541); 208 209 // Test forward and backward offsets 210 assert(rec.coordinateAtOffset(2) == 11_870); 211 assert(rec_neg.coordinateAtOffset(2) == 14_408); 212 213 assert(rec.coordinateAtBegin == 11_869); 214 assert(rec.coordinateAtEnd == 14_409); 215 216 assert(rec_neg.coordinateAtBegin == 14_409); 217 assert(rec_neg.coordinateAtEnd == 11_869); 218 219 // TODO validator 220 assert(rec.isValid); 221 }