1 module gff3d.gff3record;
2 
3 import std.stdio;
4 import std.algorithm.iteration: splitter;
5 import std.stdio: chunks, File;
6 import std.algorithm: filter,map;
7 import std.traits: ReturnType;
8 import std.array: replace;
9 import std.conv: to;
10 import std.string;
11 import std.file: mkdir;
12 import std.algorithm.searching: countUntil;
13 import std.range: drop, ForwardRange, chain;
14 import std.format : format;
15 
16 import dhtslib.coordinates;
17 
18 //import dhtslib.htslib.hts_log;
19 
20 /** GFF3 Record
21 
22 Format Documentation:
23  *  http://gmod.org/wiki/GFF3
24  *  https://useast.ensembl.org/info/website/upload/gff3.html
25  *  https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
26  *  http://www.sequenceontology.org/gff3.shtml
27  *
28  *  TODO: make sortable
29  *  TODO: Make record builder (i.e. start with blank record and add attrs) to prep for writing
30  */
31 struct GFF3_Record
32 {
33     private ubyte[] raw;
34 
35     /// constructor (raw ubytes)
36     this(ubyte[]data){
37         this.raw = data;
38     }
39     /// constructor (string)
40     this(string data){
41         this.raw = cast(ubyte[])data;
42     }
43 
44     /// TODO: Not implemented; (almost) always true
45     @property bool isValid()
46     {
47         //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length));
48         return (raw.length >= 0 ? true : false);
49     }
50 
51     /// Column 1: seqid (aka contig); basis for the coordinate system
52     @property seqid() const { return cast(string)this.raw.splitter('\t').front; }
53     /// ditto
54     @property contig() const{ return cast(string)this.raw.splitter('\t').front; }
55 
56     /// Column 2: source; software, procedure, or database originating the record
57     @property source() const { return cast(string)this.raw.splitter('\t').drop(1).front; }
58 
59     /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
60     @property type() const { return cast(string)this.raw.splitter('\t').drop(2).front; }
61 
62     /// Columns 4 & 5: returns Coordinate set: Obc format
63     @property coordinates() const
64     {
65         auto start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long;
66         auto end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long;
67         return Obc(start, end);
68     }
69     /// Columns 4: start; 1-based integer start position of the feature
70     @property start() const { return this.coordinates.start; }
71     /// Column 5: end; closed coordinate integer ending nucleotide position of the feature
72     @property end() const { return this.coordinates.end; }
73 
74     /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
75     /// Tragically, score can be either a float, or not present (".")
76     /// Totally arbitrarily, we will represent absent as -1
77     @property score() const {
78         if(cast(string)this.raw.splitter('\t').drop(5).front=="."){
79             return -1.0;
80         }
81         return (cast(string)this.raw.splitter('\t').drop(5).front).to!float;
82     }
83 
84     /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
85     @property strand() const {
86         return cast(char)this.raw.splitter('\t').drop(6).front[0];
87     }
88 
89     /** Column 8: phase;
90     For features of type "CDS", the phase indicates where the feature begins with
91     reference to the reading frame. The phase is one of the integers 0, 1, or 2,
92     indicating the number of bases that should be removed from the beginning of
93     this feature to reach the first base of the next codon. In other words, a
94     phase of "0" indicates that the next codon begins at the first base of the
95     region described by the current line, a phase of "1" indicates that the next
96     codon begins at the second base of this region, and a phase of "2" indicates
97     that the codon begins at the third base of this region. This is NOT to be
98     confused with the frame, which is simply start modulo 3.
99 
100     For forward strand features, phase is counted from the start field.
101     For reverse strand features, phase is counted from the end field.
102 
103     The phase is REQUIRED for all CDS features.
104     
105     Tragically, phase can be either an integer (0, 1, 2), or not present (".")
106     Totally arbitrarily, we will represent absent as -1
107     **/
108     @property phase() const {
109         if(cast(string)this.raw.splitter('\t').drop(7).front=="."){
110             return -1;
111         }
112         return (cast(string)this.raw.splitter('\t').drop(7).front).to!int;
113     }
114 
115     /// Column 9: attributes; A list of ;-separated feature attributes in key=value form
116     string attributes(const string field) const { return this.opIndex(field); }
117 
118     /// Provides map key lookup semantics for column 9 attributes
119     string opIndex(string field) const {
120         const attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
121         auto val = attrs    // actualy a Range of key=val
122             .filter!(kv => cast(string)(kv[0 .. kv.countUntil('=')]) == field);
123             //.front; // -- AssertError if range is empty
124         if (!val.empty) return cast(string) (val.front[val.front.countUntil('=')+1..$]);
125         else return "";
126 
127         /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront)
128         auto vals = attrs
129                     .filter!(kv => kv.startsWith(field ~ "="))
130                     .map!(kv => kv[kv.countUntil('=')+1 .. $]);
131         if (!vals.empty) return cast(string) val.front;
132         else return "";
133         +/
134     }
135 
136     /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...}
137     bool hasTag(string tagName)()
138     {
139         return this["tag"].splitter(",").filter!(t => t == tagName).count > 0;
140     }
141 
142     /// Computed feature length
143     @property length() const { return this.end - (this.start-1); }
144     /// Relative start === 1
145     @property relativeStart() const { return Ob(1); }
146     /// Relative start === the feature length
147     @property relativeEnd() const  { return Ob(this.length); }
148 
149     /// Genomic coordinate at offset into feature, taking strandedness into account
150     @property coordinateAtOffset(long offset) const
151     {
152         // GFF3 features are 1-based coordinates
153         assert(offset > 0);
154         offset--;   // necessary in a 1-based closed coordinate system
155         
156         // for - strand count from end; for +, ., and ? strand count from start
157         immutable auto begin = (this.strand == '-' ? this.end : this.start);
158 
159         // for - strand count backward; for +, ., and ? strand count forward
160         immutable auto direction = (this.strand == '-' ? -1 : 1);
161 
162         return Ob(begin + (direction * offset));
163     }
164     /// Genomic coordinate at beginning of feature, taking strandedness into account
165     @property coordinateAtBegin() const
166     {
167         return this.coordinateAtOffset(1);
168     }
169     /// Genomic coordinate at end of feature, taking strandedness into account
170     @property coordinateAtEnd() const
171     {
172         return this.coordinateAtOffset(this.length);
173     }
174 
175     string toString() const {
176         return cast(string) this.raw;
177     }
178     /// Returns a string with the canonical "chr:start-end" representation
179     @property string canonicalRepresentation() const {
180         return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string;
181     }
182     /// Return the seqURI representation
183     @property string seqURI() const {
184         return format("seq:unk/%s", this.canonicalRepresentation);
185     }
186 
187 }
188 unittest{
189     auto rec    = GFF3_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tID=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line)
190     auto rec_neg= GFF3_Record("chr1\tHAVANA\tgene\t11869\t14409\t.\t-\t.\tID=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_id=ENSG00000223972.5;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2"); // @suppress(dscanner.style.long_line)
191 
192     assert(rec.seqid=="chr1");
193     assert(rec.source=="HAVANA");
194     assert(rec.type=="gene");
195     assert(rec.start==11_869);
196     assert(rec.end==14_409);
197     assert(rec.score==-1.0);
198     assert(rec.strand()=='+');
199     assert(rec.phase==-1);
200     assert(rec["ID"] == "ENSG00000223972.5");
201     assert(rec["gene_id"] == "ENSG00000223972.5");
202     assert(rec["gene_type"] == "transcribed_unprocessed_pseudogene");
203     assert(rec["gene_name"] == "DDX11L1");
204     assert(rec["level"] == "2");
205     assert(rec["havana_gene"] == "OTTHUMG00000000961.2");
206 
207     assert(rec.length == 2541);
208     assert(rec.relativeStart == 1);
209     assert(rec.relativeEnd == 2541);
210 
211     // Test forward and backward offsets
212     assert(rec.coordinateAtOffset(2) == 11_870);
213     assert(rec_neg.coordinateAtOffset(2) == 14_408);
214 
215     assert(rec.coordinateAtBegin == 11_869);
216     assert(rec.coordinateAtEnd   == 14_409);
217 
218     assert(rec_neg.coordinateAtBegin == 14_409);
219     assert(rec_neg.coordinateAtEnd   == 11_869);
220 
221     // TODO validator
222     assert(rec.isValid);
223 }