1 module gff3d.gtfrecord;
2 
3 import std.stdio;
4 import std.algorithm.iteration: splitter;
5 import std.stdio: chunks, File;
6 import std.algorithm: filter,map;
7 import std.traits: ReturnType;
8 import std.array: replace;
9 import std.conv: to;
10 import std.string;
11 import std.file: mkdir;
12 import std.algorithm.searching: countUntil;
13 import std.range: drop, ForwardRange, chain;
14 import std.format : format;
15 
16 import dhtslib.coordinates;
17 
18 //import dhtslib.htslib.hts_log;
19 
20 /** GTF Record
21 
22 Format Documentation:
23  *  http://gmod.org/wiki/GFF2#The_GFF2_File_Format
24  *  https://useast.ensembl.org/info/website/upload/gff.html
25  *  
26  *  TODO: make sortable
27  *  TODO: Make record builder (i.e. start with blank record and add attrs) to prep for writing
28  */
29 struct GTF_Record
30 {
31     private ubyte[] raw;
32 
33     /// constructor (raw ubytes)
34     this(ubyte[]data){
35         this.raw = data;
36     }
37     /// constructor (string)
38     this(string data){
39         this.raw = cast(ubyte[])data;
40     }
41 
42     /// TODO: Not implemented; (almost) always true
43     @property bool isValid()
44     {
45         //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length));
46         return (raw.length >= 0 ? true : false);
47     }
48 
49     /// Column 1: seqid (aka contig); basis for the coordinate system
50     @property seqid() const { return cast(string)this.raw.splitter('\t').front; }
51     /// ditto
52     @property contig() const{ return cast(string)this.raw.splitter('\t').front; }
53 
54     /// Column 2: source; software, procedure, or database originating the record
55     @property source() const { return cast(string)this.raw.splitter('\t').drop(1).front; }
56 
57     /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
58     @property type() const { return cast(string)this.raw.splitter('\t').drop(2).front; }
59 
60     /// Columns 4 & 5: returns Coordinate set: Obc format
61     @property coordinates() const
62     {
63         auto start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long;
64         auto end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long;
65         return Obc(start, end);
66     }
67     /// Columns 4: start; 1-based integer start position of the feature
68     @property start() const { return this.coordinates.start; }
69     /// Column 5: end; closed coordinate integer ending nucleotide position of the feature
70     @property end() const { return this.coordinates.end; }
71 
72     /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
73     /// Tragically, score can be either a float, or not present (".")
74     /// Totally arbitrarily, we will represent absent as -1
75     @property score() const {
76         if(cast(string)this.raw.splitter('\t').drop(5).front=="."){
77             return -1.0;
78         }
79         return (cast(string)this.raw.splitter('\t').drop(5).front).to!float;
80     }
81 
82     /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
83     @property strand() const {
84         return cast(char)this.raw.splitter('\t').drop(6).front[0];
85     }
86 
87     /** Column 8: phase;
88     For features of type "CDS", the phase indicates where the feature begins with
89     reference to the reading frame. The phase is one of the integers 0, 1, or 2,
90     indicating the number of bases that should be removed from the beginning of
91     this feature to reach the first base of the next codon. In other words, a
92     phase of "0" indicates that the next codon begins at the first base of the
93     region described by the current line, a phase of "1" indicates that the next
94     codon begins at the second base of this region, and a phase of "2" indicates
95     that the codon begins at the third base of this region. This is NOT to be
96     confused with the frame, which is simply start modulo 3.
97 
98     For forward strand features, phase is counted from the start field.
99     For reverse strand features, phase is counted from the end field.
100 
101     The phase is REQUIRED for all CDS features.
102     
103     Tragically, phase can be either an integer (0, 1, 2), or not present (".")
104     Totally arbitrarily, we will represent absent as -1
105     **/
106     @property phase() const {
107         if(cast(string)this.raw.splitter('\t').drop(7).front=="."){
108             return -1;
109         }
110         return (cast(string)this.raw.splitter('\t').drop(7).front).to!int;
111     }
112 
113     /// Column 9: attributes; A list of ;-separated feature attributes in key=value form
114     string attributes(const string field) const { return this.opIndex(field); }
115 
116     /// Provides map key lookup semantics for column 9 attributes
117     string opIndex(string field) const {
118         const attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
119         auto val = attrs    // actualy a Range of key=val
120             .filter!(kv => ((cast(string) kv).strip[0 .. (cast(string) kv).strip.countUntil(' ')]) == field);
121             //.front; // -- AssertError if range is empty
122         if (!val.empty) return ((cast(string) val.front).strip[(cast(string) val.front).strip.countUntil(' ') + 1..$]).strip.strip("\"");
123         else return "";
124 
125         /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront)
126         auto vals = attrs
127                     .filter!(kv => kv.startsWith(field ~ "="))
128                     .map!(kv => kv[kv.countUntil('=')+1 .. $]);
129         if (!vals.empty) return cast(string) val.front;
130         else return "";
131         +/
132     }
133 
134     /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...}
135     bool hasTag(string tagName)()
136     {
137         return this["tag"].splitter(",").filter!(t => t == tagName).count > 0;
138     }
139 
140     /// Computed feature length
141     @property length() const { return this.end - (this.start-1); }
142     /// Relative start === 1
143     @property relativeStart() const { return Ob(1); }
144     /// Relative start === the feature length
145     @property relativeEnd() const  { return Ob(this.length); }
146 
147     /// Genomic coordinate at offset into feature, taking strandedness into account
148     @property coordinateAtOffset(long offset) const
149     {
150         // GFF3 features are 1-based coordinates
151         assert(offset > 0);
152         offset--;   // necessary in a 1-based closed coordinate system
153         
154         // for - strand count from end; for +, ., and ? strand count from start
155         immutable auto begin = (this.strand == '-' ? this.end : this.start);
156 
157         // for - strand count backward; for +, ., and ? strand count forward
158         immutable auto direction = (this.strand == '-' ? -1 : 1);
159 
160         return Ob(begin + (direction * offset));
161     }
162     /// Genomic coordinate at beginning of feature, taking strandedness into account
163     @property coordinateAtBegin() const
164     {
165         return this.coordinateAtOffset(1);
166     }
167     /// Genomic coordinate at end of feature, taking strandedness into account
168     @property coordinateAtEnd() const
169     {
170         return this.coordinateAtOffset(this.length);
171     }
172 
173     string toString() const {
174         return cast(string) this.raw;
175     }
176     /// Returns a string with the canonical "chr:start-end" representation
177     @property string canonicalRepresentation() const {
178         return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string;
179     }
180     /// Return the seqURI representation
181     @property string seqURI() const {
182         return format("seq:unk/%s", this.canonicalRepresentation);
183     }
184 
185 }
186 unittest{
187     auto rec    = GTF_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tID \"ENSG00000223972.5\" ; gene_id ENSG00000223972.5 ; gene_id ENSG00000223972.5 ; gene_type transcribed_unprocessed_pseudogene ; gene_name DDX11L1 ; level 2 ; havana_gene OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line)
188     auto rec_neg= GTF_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t-\t.\tID \"ENSG00000223972.5\" ; gene_id ENSG00000223972.5 ; gene_id ENSG00000223972.5 ; gene_type transcribed_unprocessed_pseudogene ; gene_name DDX11L1 ; level 2 ; havana_gene OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line)
189 
190     assert(rec.seqid=="chr1");
191     assert(rec.source=="HAVANA");
192     assert(rec.type=="gene");
193     assert(rec.start==11_869);
194     assert(rec.end==14_409);
195     assert(rec.score==-1.0);
196     assert(rec.strand()=='+');
197     assert(rec.phase==-1);
198     assert(rec["ID"] == "ENSG00000223972.5");
199     assert(rec["gene_id"] == "ENSG00000223972.5");
200     assert(rec["gene_type"] == "transcribed_unprocessed_pseudogene");
201     assert(rec["gene_name"] == "DDX11L1");
202     assert(rec["level"] == "2");
203     assert(rec["havana_gene"] == "OTTHUMG00000000961.2");
204 
205     assert(rec.length == 2541);
206     assert(rec.relativeStart == 1);
207     assert(rec.relativeEnd == 2541);
208 
209     // Test forward and backward offsets
210     assert(rec.coordinateAtOffset(2) == 11_870);
211     assert(rec_neg.coordinateAtOffset(2) == 14_408);
212 
213     assert(rec.coordinateAtBegin == 11_869);
214     assert(rec.coordinateAtEnd   == 14_409);
215 
216     assert(rec_neg.coordinateAtBegin == 14_409);
217     assert(rec_neg.coordinateAtEnd   == 11_869);
218 
219     // TODO validator
220     assert(rec.isValid);
221 }