1 // Robert Burner Schadek rburners@gmail.com LGPL3
2 module xmltokenrange;
3 
4 //import std.array : Appender, appender, front, empty, popFront;
5 import std.array;
6 import std.algorithm : equal, count, countUntil;
7 import std.conv : to;
8 import std.encoding : index;
9 import std.exception : enforce;
10 import std.stdio : writeln, writefln;
11 import std.uni : isWhite, isNumber;
12 import std.range : isInputRange, lockstep;
13 //import std.format : format;
14 //import std.format;
15 import std.string : stripLeft, stripRight, indexOf, CaseSensitive, strip;
16 import std.regex : ctRegex, match, regex, matchAll, popFrontN;
17 import std.traits : isSomeChar, isAssociativeArray;
18 import std.functional : binaryFun;
19 import std.algorithm : min;
20 
21 import std.experimental.logger;
22 import fixedsizehashmap;
23 
/**
 * Finds where the leading whitespace of `str` ends.
 *
 * Params:
 *   str = the character array to inspect
 *
 * Returns: the code-unit index of the first character that is not
 * whitespace, or `str.length` when `str` is empty or consists of
 * whitespace only.
 */
ptrdiff_t stripLeftIdx(C)(C[] str) @safe pure 
{
	// assume "nothing but whitespace" until a visible character shows up
	ptrdiff_t firstVisible = str.length;
	foreach (pos, dchar ch; str)
	{
		if (!isWhite(ch))
		{
			firstVisible = pos;
			break;
		}
	}
	return firstVisible;
}
41 
/**
 * Runs `indexOfNone` on `haystack[startIdx .. $]` and translates the hit
 * back into an index relative to the whole `haystack`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = characters handed on to the overload without `startIdx`
 *   startIdx = code-unit index at which the search starts
 *   cs       = case sensitivity handed on to the overload without `startIdx`
 *
 * Returns: the index into `haystack`, or -1 when `startIdx` is not inside
 * `haystack` or the search of the slice reports -1.
 */
ptrdiff_t indexOfNone(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
		const size_t startIdx, CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 && 
		is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
	if (startIdx >= haystack.length)
	{
		return -1;
	}

	immutable ptrdiff_t relative =
		indexOfNone(haystack[startIdx .. $], needles, cs);
	return relative == -1 ? -1 : relative + cast(ptrdiff_t)startIdx;
}
57 
/**
 * Finds the first character of `haystack` that is not one of `needles`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = set of characters to skip over
 *   cs       = with `CaseSensitive.yes` characters are compared as they
 *              are; otherwise both the haystack character and the needle
 *              are lower-cased before the comparison
 *
 * Returns: the code-unit index of the first character of `haystack` that
 * equals none of the `needles`, or -1 when every character is a needle
 * (which includes an empty `haystack`).
 */
ptrdiff_t indexOfNone(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
		CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 && 
		is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
	// local import: the module only imports selected names from std.uni
	import std.uni : toLower;

	immutable bool ignoreCase = (cs != CaseSensitive.yes);

	foreach (ptrdiff_t i, dchar c; haystack)
	{
		immutable dchar probe = ignoreCase ? toLower(c) : c;

		// a character only counts as a hit when it differs from EVERY
		// needle, not merely from one of them
		bool isNeedle = false;
		foreach (dchar o; needles)
		{
			if (probe == (ignoreCase ? toLower(o) : o))
			{
				isNeedle = true;
				break;
			}
		}

		if (!isNeedle)
		{
			return i;
		}
	}

	return -1;
}
93 
/**
 * Finds the first character of `haystack` that is one of `needles`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = set of characters to look for
 *   cs       = with `CaseSensitive.yes` characters are compared as they
 *              are; otherwise both the haystack character and the needle
 *              are lower-cased before the comparison
 *
 * Returns: the code-unit index of the first matching character, or -1
 * when no character of `haystack` equals any of the `needles`.
 */
ptrdiff_t indexOfAny(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
		CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 && 
		is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
	// local import: the module only imports selected names from std.uni
	import std.uni : toLower;

	immutable bool ignoreCase = (cs != CaseSensitive.yes);

	foreach (ptrdiff_t i, dchar c; haystack)
	{
		immutable dchar probe = ignoreCase ? toLower(c) : c;
		foreach (dchar o; needles)
		{
			// fold the needle as well, otherwise an upper-case needle
			// could never match in a case-insensitive search
			if (probe == (ignoreCase ? toLower(o) : o))
			{
				return i;
			}
		}
	}

	return -1;
}
129 
// indexOfAny on plain ASCII text and on text with multi-byte characters
unittest {
	immutable ptrdiff_t asciiHit = "helloWorld".indexOfAny("Wr");
	assert(asciiHit == 5);

	// the returned index counts code units, not characters
	immutable ptrdiff_t multiByteHit = "öällo world".indexOfAny("lo ");
	assert(multiByteHit == 4, to!string(multiByteHit));
}
136 
/**
 * Runs `indexOfAny` on `haystack[startIdx .. $]` and translates the hit
 * back into an index relative to the whole `haystack`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = characters handed on to the overload without `startIdx`
 *   startIdx = code-unit index at which the search starts
 *   cs       = case sensitivity handed on to the overload without `startIdx`
 *
 * Returns: the index into `haystack`, or -1 when `startIdx` is not inside
 * `haystack` or the search of the slice reports -1.
 */
ptrdiff_t indexOfAny(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
		const size_t startIdx, CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 && 
		is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
	if (startIdx >= haystack.length)
	{
		return -1;
	}

	immutable ptrdiff_t relative =
		indexOfAny(haystack[startIdx .. $], needles, cs);
	return relative == -1 ? -1 : relative + cast(ptrdiff_t)startIdx;
}
152 
/**
 * Removes whitespace from `c` in place.
 *
 * For `string` the text is passed through `strip`, so trailing whitespace
 * is removed along with the leading whitespace. For every other character
 * array only the leading whitespace is removed.
 *
 * Params:
 *   c = the text to trim; it is re-sliced, never copied
 */
void eatWhitespace(C)(ref C c) @safe pure {
	static if(is(C == string)) {
		c = c.strip();
	} else {
		// stripLeft yields an empty slice for empty or all-whitespace
		// input and never cuts through a multi-byte code point
		c = c.stripLeft();
	}
}
165 
// leading blanks of a string are removed by eatWhitespace
unittest {
	string padded = "    foo";
	eatWhitespace(padded);
	assert(padded.equal("foo"));
}
171 
/**
 * Consumes an attribute key of the form `key =` from the front of `c`.
 *
 * `c` is trimmed with `eatWhitespace` first; afterwards it starts right
 * behind the first `=`.
 *
 * Params:
 *   c = the text to read from; advanced past the `=`
 *
 * Returns: the text in front of the first `=`, stripped of surrounding
 * whitespace.
 *
 * Throws: the `enforce` call fails when `c` contains no `=`.
 */
string eatKey(C)(ref C c) @trusted pure {
	eatWhitespace(c);

	immutable eqPos = c.indexOf("=");
	enforce(eqPos != -1);

	string rawKey = c[0 .. eqPos];
	c = c[eqPos + 1 .. $];

	return strip(rawKey);
}
181 
// eatKey returns the trimmed key and leaves nothing of "key = " behind
unittest {
	string source = "   \tfoo = ";
	immutable string key = eatKey(source);
	assert(key == "foo", "\"" ~ key ~ "\"");
	assert(source == "", "\"" ~ source ~ "\"");
}
188 
/**
 * Consumes one attribute value from the front of `c` and returns it.
 *
 * `c` is trimmed with `eatWhitespace` first. If `c` contains a `"` or `'`,
 * everything up to and including the first such quote is dropped and the
 * value is the text up to the next occurrence of the same quote character
 * that is not directly preceded by a backslash; that closing quote is
 * consumed as well. When the closing quote is missing the remainder of `c`
 * becomes the value.
 *
 * If `c` contains no quote at all, the value is the text in front of the
 * first character for which `isNumber` is true, and that character is
 * consumed. When there is no such character the whole of `c` is the value.
 *
 * Params:
 *   c = the text to read from; advanced past the value and trimmed again
 *
 * Returns: the attribute value without its quotes.
 */
string eatAttri(C)(ref C c) @trusted pure {
	eatWhitespace(c);
	auto firstTick = c.indexOfAny("\"'");
	string attri;
	if(firstTick != -1) {
		// the value must be closed by the same quote that opened it
		dchar foundString = c[firstTick];
		c = c[firstTick+1 .. $];

		// stay inside the text: an unterminated value must not run past
		// the end of the buffer
		size_t i = 0;
		for(; i < c.length; ++i) {
			if(c[i] == foundString && (i == 0 || c[i-1] != '\\')) {
				break;
			}
		}

		attri = c[0 .. i];
		c = c[i .. $];
		if(!c.empty && c[0] == foundString) {
			c = c[1 .. $];
		}
		eatWhitespace(c);
	} else {
		// NOTE(review): ending an unquoted value at the first numeric
		// character looks unintended (isWhite may have been meant);
		// the predicate is kept as it was — confirm.
		size_t end = c.length;
		bool found = false;
		foreach(idx, dchar ch; c) {
			if(isNumber(ch)) {
				// idx is a code-unit index and therefore safe to slice with
				end = idx;
				found = true;
				break;
			}
		}

		attri = c[0 .. end];
		c = c[end .. $];
		if(found) {
			// drop the terminating character as a whole code point
			c.popFront();
		}
		eatWhitespace(c);
	}

	return attri;
}
223 
// a double-quoted value is returned without quotes and fully consumed
unittest {
	string source = " \"asdf\"  ";
	immutable string value = eatAttri(source);
	assert(value == "asdf", "\"" ~ value ~ "\" " ~ source);
	assert(source.empty, "\"" ~ source ~ "\"");
}
230 
// Attribute storage helpers. With version XML_AA the attributes live in a
// built-in associative array, otherwise in a container that provides
// `insert(k, v)` and `contains(k)`.
version(XML_AA) {
/// Stores `v` under `k` in the associative array `t`.
void insert(T,K,V)(ref T t, K k, V v) @trusted 
		if(isAssociativeArray!T) {
	// T is the associative array itself, so index it directly
	t[k] = v;
}

/**
 * Tells whether `k` is present. `t` may be the associative array itself or
 * any value that has an associative array member named `attributes`.
 */
bool has(T,K)(ref T t, K k) @trusted nothrow {
	static if(isAssociativeArray!T) {
		return (k in t) !is null;
	} else {
		return (k in t.attributes) !is null;
	}
}
} else {
/// Tells whether `t.attributes` contains the key `k`.
bool has(T,K)(ref T t, K k) @trusted nothrow  {
	return t.attributes.contains(k);
}

/// Stores `v` under `k` by calling `t.insert(k, v)`.
void insert(T,K,V)(ref T t, K k, V v) @trusted {
	t.insert(k,v);
}
}
250 
251 
/// Classification of an XmlToken, derived from the first characters of the
/// token text.
enum XmlTokenKind {
	/// default value; also assigned to a token created from empty text
	Invalid,
	/// text starting with '<' whose second to last character is '/'
	OpenClose,
	/// text starting with '<' that matches none of the other tag kinds
	Open,
	/// text that does not start with '<'
	Text,
	/// text starting with "<!"
	Comment,
	/// text starting with "<?"
	Type,
	/// text starting with "</"
	Close
}
261 
/**
 * One token cut out of XML source text.
 *
 * The constructor classifies the text and, for tags, extracts the tag name
 * and the attributes. While doing so `data` is consumed from the front, so
 * for tags `data` holds only what is left after parsing.
 */
struct XmlToken {
public:
	/**
	 * Parses the token text.
	 *
	 * Params:
	 *   d = the raw text of the token
	 *   l = the line number stored in `line`
	 */
	this(string d, size_t l) {
		this.data = d;
		this.line = l;
		if(this.data.length == 0) {
			this.kind = XmlTokenKind.Invalid;
			return;
		}
		this.kind = this.getKind();
		// only tags carry a name and attributes
		if(this.kind == XmlTokenKind.Open || this.kind ==
				XmlTokenKind.OpenClose || this.kind == XmlTokenKind.Close) {
			this.readName();
			this.readAttributes();
		}
	}

	/// Gives access to the value of the attribute named `key`.
	ref string opIndex(string key) {
		version(XML_AA) {
			return this.attributes[key];
		} else {
			return this.attributes[key].value;
		}
	}

	/// tag name; only set for Open, OpenClose and Close tokens
	string name;
	/// classification of this token
	XmlTokenKind kind = XmlTokenKind.Invalid;
	/// attributes read from the tag, keyed by attribute name
	version(XML_AA) {
		string[string] attributes;
	} else {
		FashMap!(string,string) attributes;
	}
	/// token text; for tags only the part not consumed by parsing
	string data;
	/// line number handed to the constructor
	size_t line;

private:
	/**
	 * Classifies the token by its leading characters and removes the
	 * markers it recognised ('<', and a following '/', '!' or '?') from
	 * the front of `data`.
	 */
	XmlTokenKind getKind() {
		//import std.format : form = format;
		//assert(this.data.length, form("no data at line %u", this.line));
		assert(this.data.length);
		if(this.data[0] != '<') {
			return XmlTokenKind.Text;
		}

		this.data.popFront();
		if(this.data.empty) {
			// a lone '<' (truncated input) has nothing left to classify
			return XmlTokenKind.Invalid;
		}

		if(this.data[0] == '/') {
			this.data.popFront();
			return XmlTokenKind.Close;
		} else if(this.data[0] == '!') {
			this.data.popFront();
			return XmlTokenKind.Comment;
		} else if(this.data[0] == '?') {
			this.data.popFront();
			return XmlTokenKind.Type;
		} else if(this.data.length > 1 && this.data[$-2] == '/') {
			return XmlTokenKind.OpenClose;
		} else {
			return XmlTokenKind.Open;
		}
	}

	/// Index of the first non-whitespace character of `data`.
	ptrdiff_t readNameBeginIdx() pure {
		if(this.data.length > 0) {
			return this.data.stripLeftIdx();
		} else {
			return 0;
		}
	}

	/**
	 * Index one past the last character of the tag name. The name ends at
	 * the first whitespace character, '>' or '/', or at the end of `data`
	 * when none of these follows.
	 */
	ptrdiff_t readNameEndIdx() pure {
		auto lowIdx = readNameBeginIdx();
		auto rel = this.data[lowIdx .. $].indexOfAny(" \t\r\n>/");
		if(rel == -1) {
			// no terminator (truncated tag): the name runs to the end
			return this.data.length;
		}
		return rel + lowIdx;
	}

	/**
	 * Moves the tag name from the front of `data` into `name`. When there
	 * is no name, a single character is dropped from `data` instead.
	 */
	void readName() pure {
		auto low = this.readNameBeginIdx();
		auto high = this.readNameEndIdx();
		assert(low <= this.data.length, this.data);
		assert(high <= this.data.length, this.data);
		if(low < high) {
			this.name = this.data[low .. high];
			this.data = this.data[high .. $];
		} else if(!this.data.empty) {
			this.data.popFront();
		}
	}

	/**
	 * Reads `key = value` pairs from `data` into `attributes` until `data`
	 * is used up or starts with ">" or "/>".
	 */
	void readAttributes() {
		while(!this.data.empty) {
			eatWhitespace(this.data);
			if(this.data.empty) {
				// nothing but whitespace was left
				break;
			}

			// end of the tag reached
			if(this.data[0] == '>') {
				break;
			}
			if(this.data.length >= 2 && this.data[0 .. 2] == "/>") {
				break;
			}

			// eatKey and eatAttri trim the whitespace around what they read
			string key;
			try {
				key = eatKey(this.data);
			} catch(Exception e) {
				assert(false, "unable to read key at line "
						~ to!string(this.line));
			}
			string attri = eatAttri(this.data);
			version(XML_AA) {
				this.attributes[key] = attri;
			} else {
				insert(this.attributes, key, attri);
			}
		}
	}

	//static auto re = ctRegex!("\\s*(\\w+)\\s*=\\s*\"(\\w+)\"\\s*");
}
388 
/**
 * Input range that cuts a character range into XmlToken values.
 *
 * The text of the current token is collected in an internal buffer; `front`
 * builds an XmlToken from that buffer on every call.
 */
struct XmlTokenRange(InputRange) {
public:
	/// The not yet consumed rest of the source.
	@property InputRange input() {
		return input_;
	}

	/// Sets a new source and immediately reads its first token.
	@property void input(InputRange i) {
		input_ = i;
		this.store_ = appender!string();
		this.readFromRange();
	}

	/// The current token, parsed from the buffered text.
	@property auto front() {
		return XmlToken(this.store_.data(), line);
	}

	/// Discards the current token and reads the next one.
	@property void popFront() {
		this.store_ = appender!string();
		readFromRange();
	}

	/// True once no token is buffered and the source is used up.
	@property bool empty() {
		return this.store_.data().empty && std.array.empty(this.input_);
	}

private: 
	/// number of line breaks seen so far
	size_t line;
	/// whether the previously inspected character was '\r'
	bool prevWasCR_;

	/**
	 * Advances `line` for the line terminators "\n", "\r" and "\r\n"; the
	 * two-character sequence counts as a single line break.
	 */
	void countLine(dchar ch) {
		if(ch == '\r' || (ch == '\n' && !prevWasCR_)) {
			++this.line;
		}
		prevWasCR_ = (ch == '\r');
	}

	/**
	 * Buffers a tag: copies characters until every '<' not preceded by a
	 * backslash has been matched by a '>' not preceded by a backslash.
	 */
	void equalCrocos() {
		dchar it;
		dchar prev = '\0';
		size_t numCrocos = 0;
		while(!input_.empty()) {
			it = input_.front();
			countLine(it);

			if(it == '<' && prev != '\\') {
				++numCrocos;
			} else if(it == '>' && prev != '\\') {
				--numCrocos;
			}
			prev = it;

			this.store_.put(it);
			input_.popFront();
			if(!numCrocos) {
				// the outermost bracket has been closed
				break;
			}
		}
	}

	/**
	 * Buffers text: copies characters up to, but not including, the next
	 * '<' that is not preceded by a backslash.
	 */
	void eatTillCroco() {
		dchar it;
		dchar prev = '\0';
		for(; !input_.empty(); input_.popFront()) {
			it = input_.front();
			countLine(it);
			if(it == '<' && prev != '\\') {
				break;
			}
			this.store_.put(it);
			prev = it;
		}
	}

	/// Skips whitespace and buffers the next token, if there is one.
	void readFromRange() {
		eatWhiteSpace();
		if(this.input_.empty) {
			return;
		}

		if(this.input_.front == '<') {
			equalCrocos();
		} else {
			eatTillCroco();
		}
	}

	/// Drops leading whitespace from the source while counting lines.
	void eatWhiteSpace() {
		while(!input_.empty && isWhite(std.array.front(input_))) {
			countLine(input_.front());
			input_.popFront();
		}
	}

	InputRange input_;
	Appender!string store_;
}
488 
/**
 * Convenience constructor: creates an XmlTokenRange over `input` with the
 * element type inferred from the argument.
 */
auto xmlTokenRange(InputRange)(InputRange input) {
	XmlTokenRange!InputRange tokens;
	// assigning the property also reads the first token
	tokens.input = input;
	return tokens;
}
494 
// text followed by an empty tag: a Text token, then an Open token
unittest {
	auto s = "some fun string<>";
	auto r = xmlTokenRange(s);

	auto f = r.front();
	assert(f.kind == XmlTokenKind.Text);
	r.popFront();
	f = r.front();
	assert(f.kind == XmlTokenKind.Open, to!string(f.kind));
}

// the range satisfies the input range interface
unittest {
	static assert(isInputRange!(XmlTokenRange!string));
}

// plain text is passed through unchanged
unittest {
	string testString = "hello";
	auto r = xmlTokenRange(testString);
	assert(r.front.data == "hello", r.front.data);
}

// opening tag
unittest {
	string testString = "<hello>";
	auto r = xmlTokenRange(testString);
	assert(r.front.name == "hello", r.front.name);
}

// self-closing tag
unittest {
	string testString = "<hello/>";
	auto r = xmlTokenRange(testString);
	assert(r.front.kind == XmlTokenKind.OpenClose);
	assert(r.front.name == "hello", "\"" ~ r.front.name ~ "\"");
}

// closing tag
unittest {
	string testString = "</hello>";
	auto r = xmlTokenRange(testString);
	assert(r.front.kind == XmlTokenKind.Close);
	assert(r.front.name == "hello", "\"" ~ r.front.name ~ "\"");
}

// two tags back to back are split into two tokens
unittest {
	string testString = "<hello>";
	string testString2 = "<hello>";
	auto test = testString ~ testString2;
	auto r = xmlTokenRange(test);

	size_t seen = 0;
	for(; !r.empty; r.popFront()) {
		auto tok = r.front;
		assert(tok.kind == XmlTokenKind.Open, to!string(tok.kind));
		assert(tok.name == "hello", "\"" ~ tok.name ~ "\"");
		++seen;
	}
	assert(seen == 2, to!string(seen));
}

// attributes of a tag are read into the token
unittest {
	string testString = "<hello zzz=\"ttt\" world=\"foo\" args=\"bar\">";
	size_t seen = 0;
	foreach(it; xmlTokenRange(testString)) {
		assert(it.kind == XmlTokenKind.Open, to!string(it.kind));
		assert(it.name == "hello", "\"" ~ it.name ~ "\"");
		assert(it["zzz"] == "ttt", it["zzz"]);
		assert(it["world"] == "foo", it["world"]);
		assert(it["args"] == "bar", it["args"]);

		// the attribute container must also be iterable
		foreach(key, value; it.attributes) {
			//writefln("%s %s", key, value);
		}
		++seen;
	}
	assert(seen == 1, to!string(seen));
}