1 module libd.datastructures..string;
2 import libd.memory, libd.datastructures, libd.util.errorhandling;
3 
4 version(X86_64){}
5 else static assert(false, "libd's string is only compatible on x86_64, at least for the foreseeable future.");
6 
7 // For now, String will just use g_alloc because I feel having different strings with different allocators isn't actually worth it?
8 // If needed libd can provide a specialised allocator for strings.
9 struct String
10 {
11     /++
12         So, this abuses a few things:
13             * Pointers only actually use 48-bits, with the upper 16-bits being sign extended.
14             * The 47th bit is pretty much always 0 in user mode, so the upper 16-bits are also 0.
15             * Little endian is an important metric for why the pointer is put last, and x86_64 is a little endian architecture.
16 
17         For 'small' strings:
18             * Bytes 0-22 contain the small string.
19             * Byte 23 contains the null terminator (since I want libd's strings to always provide one without reallocation needed - cheaper integration with C libs).
20             * Byte 24 contains the 'small' length, which will always be non-0 for small strings.
21 
22         For 'big' strings:
23             * Bytes 0-8 contain the length.
24             * Bytes 8-16 contain the capacity.
25             * Bytes 16-24 contain the allocated pointer.
26                 * Because of little endian, and the fact the upper 16-bits of a pointer will be 0, this sets the 'small' length to 0
27                   which we can use as a flag to determine between small and big strings.
28 
29         Special case 'empty':
30             If the string is completely empty, then Bits 16-24 will be all 0, indicating both that there's no 'small' length, and also a null 'big' pointer.
31      + ++/
32     private union Store
33     {
34         struct // smol
35         {
36             // D chars init to 0xFF (invalid utf-8 character), which we don't want here.
37             // Although, because it's in a union, I don't actually know how D inits this memory by default. Better safe than sorry.
38             char[22] smallString   = '\0';
39             char     smallNullTerm = '\0';
40             ubyte    smallLength;
41         }
42 
43         struct // big
44         {
45             size_t bigLength;
46             size_t bigCapacity;
47             char*  bigPtr;
48         }
49     }
50     private Store _store;
51     static assert(typeof(this).sizeof == 24, "String isn't 24 bytes anymore :(");
52 
53     private alias _alloc = g_alloc;
54     private alias Grow   = DefaultGrowth;
55 
56     @nogc nothrow:
57 
58     @trusted
59     this(string str)
60     {
61         this = str;
62     }
63 
64     @trusted // Bounds checking + always confirming pointer validity *should* make this safe.
65     this(bcstring str)
66     {
67         this = str;
68     }
69 
70     this(const char* ptr)
71     {
72         import runtime.dynamicfuncs : strlen;
73         const length = strlen(ptr);
74         this.put(ptr[0..length]);
75     }
76 
77     this(this) @trusted
78     {
79         if(!this.isCompletelyEmpty && !this.isSmall)
80         {
81             auto slice = this._alloc.makeArray!char(this._store.bigLength+1).slice; // We'll just allocate the length and not use Growth or capacity.
82             if(slice is null)
83                 onOutOfMemoryError(slice.ptr);
84             slice[0..$-1] = this._store.bigPtr[0..this._store.bigLength];
85             slice[$-1]    = '\0';
86             this._store.bigPtr      = slice.ptr;
87             this._store.bigLength   = slice.length-1; // Otherwise we include the null term in this value, which we don't do.
88             this._store.bigCapacity = slice.length-1; // ^^^
89         }
90     }
91 
92     @trusted
93     ~this()
94     {
95         this.disposeBigStringIfExists();
96         this._store = Store.init;
97     }
98 
99     @trusted
100     void putMany(Params...)(scope Params params)
101     {
102         foreach(param; params)
103             this.put(param);
104     }
105     
106     @trusted // This is technically safe by itself due to all the checks, but `chars` might point to bad memory. Can't express that in D though.
107     void put(scope bcstring chars)
108     {
109         auto newLength = chars.length;
110         if(this.isSmall)
111             newLength += this._store.smallLength;
112         else
113             newLength += this._store.bigLength;
114 
115         if(this.isSmall || this.isCompletelyEmpty)
116         {
117             if(newLength <= this._store.smallString.length)
118             {
119                 const start = this._store.smallLength;
120                 this._store.smallString[start..start + chars.length] = chars[0..$];
121                 this._store.smallLength += chars.length;
122                 return;
123             }
124 
125             this.moveToBigString();
126         }
127 
128         this.growBigStringIfNeeded(newLength+1); // +1 for null term.
129         const start = this._store.bigLength;
130         this._store.bigPtr[start..start+chars.length] = chars[0..$];
131         this._store.bigLength += chars.length;
132         this._store.bigPtr[this._store.bigLength] = '\0';
133     }
134 
135     @trusted
136     void put()(scope const auto ref String str)
137     {
138         this.put(str.sliceUnsafe);
139     }
140 
141     @trusted
142     void put(char ch)
143     {
144         char[] fakeArray = (&ch)[0..1];
145         this.put(fakeArray);
146     }
147 
148     void put(Range)(Range r)
149     if(!is(Range : bcstring))
150     {
151         foreach(value; r)
152             this.put(value);
153     }
154 
155     @trusted
156     bool opEquals(scope bcstring other) const
157     {
158         return __equals(this.sliceUnsafe, other);
159     }
160 
161     @trusted
162     bool opEquals()(scope auto ref const String other) const
163     {
164         return this.sliceUnsafe == other.sliceUnsafe;
165     }
166 
167     @safe
168     bool opEquals(typeof(null) _) const
169     {
170         return this.isCompletelyEmpty;
171     }
172 
173     @trusted
174     void opAssign(bcstring str)
175     {   
176         if(str is null)
177             this = null;
178         else if(str.length <= this._store.smallString.length)
179             this.setSmallString(str);
180         else
181             this.setBigString(str);
182     }
183 
184     @trusted
185     void opAssign(typeof(null) _)
186     {
187         this.__xdtor();
188     }
189 
190     @safe
191     size_t opDollar() const
192     {
193         return this.length;
194     }
195 
196     @trusted
197     bcstring opIndex() const
198     {
199         return this.sliceUnsafe;
200     }
201 
202     @trusted
203     char opIndex(size_t index) const
204     {
205         assert(index < this.length, "Index is out of bounds.");
206         return this.sliceUnsafe[index];
207     }
208 
209     @trusted // Function is @safe, further usage by user is not.
210     bcstring opSlice(size_t start, size_t end) const
211     {
212         assert(end <= this.length, "End index is out of bounds.");
213         assert(start <= end, "Start index is greater than End index.");
214         return this.sliceUnsafe[start..end];
215     }
216 
217     @trusted // HEAVILY assumes that the allocated memory is still valid. Since at the moment we always use g_alloc, this should be guarenteed outside of bugs in this struct.
218     void opIndexAssign(char v, size_t index)
219     {
220         assert(index < this.length, "Index is out of bounds.");
221         cast()this.sliceUnsafe[index] = v; // cast away const is fine for internal functions like this.
222     }
223 
224     @trusted
225     void opSliceAssign(char v, size_t start, size_t end)
226     {
227         auto slice = cast(char[])this[start..end];
228         slice[] = v;
229     }
230 
231     @trusted
232     void opSliceAssign(bcstring str, size_t start, size_t end)
233     {
234         auto slice = cast(char[])this[start..end];
235         assert(end - start == str.length, "Mismatch between str.length, and (end - start).");
236         slice[0..$] = str[0..$];
237     }
238 
239     @trusted
240     String opBinary(string op)(const scope auto ref String rhs) const
241     if(op == "~")
242     {
243         String ret = cast()this; // NRVO better come into play here.
244         ret.put(rhs);
245         return ret;
246     }
247 
248     @trusted
249     String opBinary(string op)(scope bcstring rhs) const
250     if(op == "~")
251     {
252         String ret = cast()this; // NRVO better come into play here.
253         ret.put(rhs);
254         return ret;
255     }
256 
257     @trusted
258     void opOpAssign(string op)(const scope auto ref String rhs)
259     if(op == "~")
260     {
261         this.put(rhs);
262     }
263 
264     @trusted
265     void opOpAssign(string op)(scope bcstring rhs)
266     if(op == "~")
267     {
268         this.put(rhs);
269     }
270 
271     alias range = rangeUnsafe; // Could probably make a safe/safer custom range in the future.
272     @property
273     bcstring rangeUnsafe() const
274     {
275         return this.sliceUnsafe;
276     }
277 
278     @property @safe
279     size_t length() const
280     {
281         return (this.isSmall) ? this._store.smallLength : this._store.bigLength; 
282     }
283 
284     @property
285     void length(size_t newLen)
286     {
287         if(this.isSmall || this.isCompletelyEmpty)
288         {
289             if(newLen > this._store.smallString.length)
290             {
291                 this.moveToBigString();
292                 this.length = newLen; // So we don't have to duplicate logic.
293             }
294             else
295                 this._store.smallLength = cast(ubyte)newLen;
296             return;
297         }
298 
299         // Lazy choice: Once we're a big string, we're always a big string.
300         //              Will eventually *not* do this, but >x3
301         if(newLen > this._store.bigLength)
302         {
303             const start = this._store.bigLength;
304             this.growBigStringIfNeeded(newLen);
305             this._store.bigPtr[start..newLen] = char.init;
306         }
307 
308         this._store.bigLength = newLen;
309         this._store.bigPtr[newLen] = '\0';
310     }
311 
312     @property
313     const(char)* ptrUnsafe() const return
314     {
315         return (this.isSmall) ? &this._store.smallString[0] : this._store.bigPtr;
316     }
317 
318     @property
319     bcstring sliceUnsafe() const return
320     {
321         return (this.isSmall) ? this._store.smallString[0..this._store.smallLength] : this._store.bigPtr[0..this._store.bigLength];
322     }
323 
324     @trusted
325     private void setSmallString(scope bcstring chars)
326     {
327         assert(chars.length <= this._store.smallString.length);
328         this.disposeBigStringIfExists(); // Resets us to a "completely empty" state.
329         this._store.smallString[0..chars.length] = chars[0..$];
330         this._store.smallLength = cast(ubyte)chars.length;
331     }
332 
333     @trusted
334     private void setBigString(scope bcstring chars)
335     {
336         this.growBigStringIfNeeded(chars.length+1); // +1 for null term.
337         assert(this._store.smallLength == 0, "Nani?");
338         this._store.bigLength               = chars.length;
339         this._store.bigPtr[0..chars.length] = chars[0..$];
340         this._store.bigPtr[chars.length]    = '\0';
341         assert(!this.isSmall, "Eh?");
342     }
343 
344     @trusted
345     private void moveToBigString()
346     {
347         if(this.isCompletelyEmpty || !this.isSmall)
348             return;
349 
350         // Have to copy into a buffer first, otherwise setBigString will overwrite the string data before it ends up copying it.
351         char[22] buffer;
352         buffer[0..$]      = this._store.smallString[0..$];
353         const smallLength = this._store.smallLength;
354         this.setBigString(buffer[0..smallLength]);
355     }
356 
357     @trusted
358     private void growBigStringIfNeeded(size_t newSize)
359     {
360         if(this.isCompletelyEmpty || this.isSmall)
361         {
362             this._store.bigCapacity = Grow.grow(newSize);
363             this._store.bigPtr      = this._alloc.makeArray!char(this._store.bigCapacity).ptr;
364             if(this._store.bigPtr is null)
365                 onOutOfMemoryError(null);
366             return;
367         }
368 
369         if(newSize > this._store.bigCapacity)
370         {
371             const oldCapacity       = this._store.bigCapacity;
372             this._store.bigCapacity = Grow.grow(newSize);
373             this._store.bigPtr      = this._alloc.growArray!char(
374                 this._store.bigCapacity, 
375                 this._store.bigPtr[0..oldCapacity].notNull!(_alloc.Tag)
376             ).ptr;
377             if(this._store.bigPtr is null)
378                 onOutOfMemoryError(null);
379         }
380     }
381 
382     @trusted
383     private void disposeBigStringIfExists()
384     {
385         if(!this.isCompletelyEmpty && !this.isSmall)
386         {
387             this._alloc.dispose(this._store.bigPtr); // set to null by .dispose
388             this._store.smallString[] = '\0';
389             assert(this.isCompletelyEmpty, "?");
390         }
391     }
392 
393     @trusted
394     private bool isCompletelyEmpty() const
395     {
396         return this._store.bigPtr is null;
397     }
398 
399     @safe
400     private bool isSmall() const
401     {
402         return this._store.smallLength > 0;
403     }
404 }
405 ///
406 @("String")
407 unittest
408 {
409     auto s = String("Hello");
410     assert(s.isSmall); // .isSmall is a private function
411     assert(!s.isCompletelyEmpty); // ^^^
412     assert(s.length == 5);
413     assert(s == "Hello");
414     assert(s.ptrUnsafe[5] == '\0');
415 
416     auto s2 = s;
417     assert(s2.isSmall && !s2.isCompletelyEmpty);
418     assert(s2.length == 5);
419     assert(s2 == "Hello");
420     s2.put(", world!");
421     assert(s2.length == 13);
422     assert(s.length == 5);
423     assert(s2 == "Hello, world!");
424     assert(s2.ptrUnsafe[13] == '\0');
425 
426     s = String("This is a big string that is bigger than 22 characters long!");
427     assert(!s.isSmall);
428     assert(s.length == 60);
429     assert(s == "This is a big string that is bigger than 22 characters long!");
430     assert(s.ptrUnsafe[60] == '\0');
431 
432     s2 = s;
433     assert(!s2.isSmall);
434     assert(s2.length == 60);
435     assert(s2._store.bigPtr !is s._store.bigPtr);
436     s.__xdtor();
437     s2.put("This shouldn't crash because we copied things.");
438     assert(s2 == "This is a big string that is bigger than 22 characters long!This shouldn't crash because we copied things.");
439     assert(s2.ptrUnsafe[s2.length] == '\0');
440 
441     s2.length = 60;
442     assert(s2.length == 60);
443     assert(s2 == "This is a big string that is bigger than 22 characters long!");
444     assert(s2.ptrUnsafe[60] == '\0');
445 
446     s2.length = 61;
447     assert(s2 == "This is a big string that is bigger than 22 characters long!"~char.init);
448     assert(s2.ptrUnsafe[61] == '\0');
449 
450     // Making sure we don't crash when using any of these things from a .init state.
451     s2.__xdtor();
452     assert(!s2.isSmall && s2.isCompletelyEmpty);
453     assert(s2.ptrUnsafe is null);
454     assert(s2.sliceUnsafe is null);
455     assert(s2.length == 0);
456     s2.put("abc");
457     assert(s2.isSmall);
458 
459     assert(s2 == "abc");
460     assert(s2 == String("abc"));
461     assert(s2 != null);
462     s2 = null;
463     assert(s2 == null);
464 
465     s2 = "abc";
466     assert(s2.isSmall);
467     assert(s2 == "abc");
468     assert(s2[1] == 'b');
469     assert(s2[0..2] == "ab");
470     assert(s2[3..3].length == 0);
471     assert(s2[] == "abc");
472 
473     s2[1] = 'd';
474     assert(s2 == "adc");
475     s2[0..2] = 'b';
476     assert(s2 == "bbc");
477     s2[0..3] = "put";
478     assert(s2 == "put");
479 
480     assert(s2 ~ "in" == "putin");
481     assert(s2 ~ String("ty") == "putty");
482     s2 ~= " it ";
483     assert(s2 == "put it ");
484     s2 ~= String("in mah belleh");
485     assert(s2 == "put it in mah belleh");
486 }