// Robert Burner Schadek rburners@gmail.com LGPL3
module xmltokenrange;

//import std.array : Appender, appender, front, empty, popFront;
import std.array;
import std.algorithm : equal, count, countUntil;
import std.conv : to;
import std.encoding : index;
import std.exception : enforce;
import std.stdio : writeln, writefln;
import std.uni : isWhite, isNumber;
import std.range : isInputRange, lockstep;
//import std.format : format;
//import std.format;
import std.string : stripLeft, stripRight, indexOf, CaseSensitive, strip;
import std.regex : ctRegex, match, regex, matchAll, popFrontN;
import std.traits : isSomeChar, isAssociativeArray;
import std.functional : binaryFun;
import std.algorithm : min;

import std.experimental.logger;
import fixedsizehashmap;

/**
 * Finds where the leading whitespace of `str` ends.
 *
 * Returns: the code-unit index of the first non-whitespace character, or
 * `str.length` when `str` is empty or consists only of whitespace.
 */
ptrdiff_t stripLeftIdx(C)(C[] str) @safe pure
{
    foreach (i, dchar c; str)
    {
        if (!isWhite(c))
        {
            return i;
        }
    }

    // Empty or all-whitespace input: everything would be stripped.
    return str.length;
}

/**
 * Like the three-argument `indexOfNone`, but the search starts at code-unit
 * offset `startIdx`.
 *
 * Returns: the index (relative to the start of `haystack`) of the first
 * character at or after `startIdx` that is not contained in `needles`, or
 * `-1` if there is none or `startIdx` is out of range.
 */
ptrdiff_t indexOfNone(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
        const size_t startIdx, CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 &&
        is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
    if (startIdx < haystack.length)
    {
        ptrdiff_t foundIdx = indexOfNone(haystack[startIdx .. $], needles, cs);
        if (foundIdx != -1)
        {
            return foundIdx + cast(ptrdiff_t)startIdx;
        }
    }
    return -1;
}

/**
 * Finds the first character of `haystack` that matches none of the
 * characters in `needles`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = set of characters to skip over
 *   cs       = with anything other than `CaseSensitive.yes` both sides are
 *              lower-cased before they are compared
 *
 * Returns: the code-unit index of the first character not in `needles`, or
 * `-1` when every character of `haystack` is in `needles`.
 */
ptrdiff_t indexOfNone(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
        CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 &&
        is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
    // std.uni is only selectively imported at module level.
    import std.uni : toLower;

    immutable bool fold = (cs != CaseSensitive.yes);

    foreach (i, dchar c; haystack)
    {
        immutable dchar probe = fold ? toLower(c) : c;
        bool matched = false;
        foreach (dchar o; needles)
        {
            if (probe == (fold ? toLower(o) : o))
            {
                matched = true;
                break;
            }
        }

        // Only a character that equals *no* needle ends the search.
        if (!matched)
        {
            return cast(ptrdiff_t)i;
        }
    }

    return -1;
}

unittest {
    assert("  foo".indexOfNone(" ") == 2);
    assert("aaa".indexOfNone("a") == -1);
    assert("abc".indexOfNone("ab") == 2);
    assert("ab ab".indexOfNone("ab", 2) == 2);
}

/**
 * Finds the first character of `haystack` that equals any character in
 * `needles`.
 *
 * Params:
 *   haystack = text to search
 *   needles  = set of characters to look for
 *   cs       = with anything other than `CaseSensitive.yes` both sides are
 *              lower-cased before they are compared
 *
 * Returns: the code-unit index of the first match, or `-1` if no character
 * of `haystack` is contained in `needles`.
 */
ptrdiff_t indexOfAny(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
        CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 &&
        is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
    // std.uni is only selectively imported at module level.
    import std.uni : toLower;

    immutable bool fold = (cs != CaseSensitive.yes);

    foreach (i, dchar c; haystack)
    {
        immutable dchar probe = fold ? toLower(c) : c;
        foreach (dchar o; needles)
        {
            if (probe == (fold ? toLower(o) : o))
            {
                return cast(ptrdiff_t)i;
            }
        }
    }

    return -1;
}

unittest {
    ptrdiff_t i = "helloWorld".indexOfAny("Wr");
    assert(i == 5);
    i = "öällo world".indexOfAny("lo ");
    assert(i == 4, to!string(i));
}

/**
 * Like the three-argument `indexOfAny`, but the search starts at code-unit
 * offset `startIdx`.
 *
 * Returns: the index (relative to the start of `haystack`) of the first
 * match at or after `startIdx`, or `-1` if there is none or `startIdx` is
 * out of range.
 */
ptrdiff_t indexOfAny(Char,R2)(const(Char)[] haystack, const(R2)[] needles,
        const size_t startIdx, CaseSensitive cs = CaseSensitive.yes) @safe pure
    if (isSomeChar!Char && isSomeChar!R2 &&
        is(typeof(binaryFun!"a == b"(haystack.front, needles.front))))
{
    if (startIdx < haystack.length)
    {
        ptrdiff_t foundIdx = indexOfAny(haystack[startIdx .. $], needles, cs);
        if (foundIdx != -1)
        {
            return foundIdx + cast(ptrdiff_t)startIdx;
        }
    }
    return -1;
}

/**
 * Removes whitespace from `c` in place.
 *
 * For `string` both leading and trailing whitespace are removed (via
 * `strip`). For any other type only leading whitespace is removed.
 */
void eatWhitespace(C)(ref C c) @safe pure {
    static if(is(C == string)) {
        c = c.strip();
    } else {
        // Nothing to strip; also avoids slicing from index -1 below.
        if(c.length == 0) {
            return;
        }
        auto idx = stripLeftIdx(c);
        if(idx == c.length) {
            // NOTE(review): all-whitespace input keeps its last code unit
            // here instead of becoming empty - kept as is, confirm whether
            // callers rely on it.
            c = c[idx-1 .. $];
        } else {
            c = c[idx .. $];
        }
    }
}

unittest {
    auto s = " foo";
    eatWhitespace(s);
    assert(equal(s, "foo"));
}

/**
 * Consumes an attribute key of the form `key =` from the front of `c`.
 *
 * Everything up to and including the first `=` is removed from `c`.
 *
 * Returns: the text before the `=`, with surrounding whitespace removed.
 * Throws: `Exception` (via `enforce`) when `c` contains no `=`.
 */
string eatKey(C)(ref C c) @trusted pure {
    eatWhitespace(c);
    auto endOfKey = c.indexOf("=");
    enforce(endOfKey != -1, "expected '=' after attribute name");
    string name = c[0..endOfKey];
    c = c[endOfKey+1 .. $];

    return name.strip();
}

unittest {
    string input = " \tfoo = ";
    auto n = eatKey(input);
    assert(n == "foo", "\"" ~ n ~ "\"");
    assert(input == "", "\"" ~ input ~ "\"");
}

/**
 * Consumes one attribute value from the front of `c`.
 *
 * A value that starts with `"` or `'` runs up to the next occurrence of the
 * same quote character that is not preceded by a backslash; both quotes are
 * consumed and not part of the result. Any other value runs up to the first
 * whitespace character, `>` or `/>`; that terminator is left in `c`.
 * Whitespace following the value is removed from `c`.
 *
 * Returns: the attribute value without its quotes.
 * Throws: `Exception` (via `enforce`) when a quoted value is not terminated.
 */
string eatAttri(C)(ref C c) @trusted pure {
    eatWhitespace(c);
    string attri;
    if(c.length > 0 && (c[0] == '"' || c[0] == '\'')) {
        immutable quote = c[0];
        c = c[1 .. $];

        // Find the closing quote; a quote directly after a backslash is
        // treated as escaped and does not end the value.
        size_t i = 0;
        while(i < c.length
                && !(c[i] == quote && (i == 0 || c[i-1] != '\\')))
        {
            ++i;
        }
        enforce(i < c.length, "unterminated attribute value");

        attri = c[0 .. i];
        c = c[i+1 .. $];
    } else {
        // Unquoted value: stop at whitespace or at the end of the tag.
        size_t end = c.length;
        foreach(i, dchar ch; c) {
            if(isWhite(ch) || ch == '>'
                    || (ch == '/' && i + 1 < c.length && c[i+1] == '>'))
            {
                end = i;
                break;
            }
        }
        attri = c[0 .. end];
        c = c[end .. $];
    }
    eatWhitespace(c);

    return attri;
}

unittest {
    string input = " \"asdf\" ";
    string attri = eatAttri(input);
    assert(attri == "asdf", "\"" ~ attri ~ "\" " ~ input);
    assert(input.empty, "\"" ~ input ~ "\"");
}

unittest {
    string input = "bar>";
    assert(eatAttri(input) == "bar");
    assert(input == ">");
}

version(XML_AA) {
    /// Stores `v` under key `k` in the associative array `t`.
    void insert(T,K,V)(ref T t, K k, V v) @trusted
            if(isAssociativeArray!T) {
        t[k] = v;
    }

    /// Returns `true` if `t.attributes` contains the key `k`.
    bool has(T,K)(ref T t, K k) @trusted nothrow {
        return (k in t.attributes) !is null;
    }
} else {
    /// Returns the result of `t.attributes.contains(k)`.
    bool has(T,K)(ref T t, K k) @trusted nothrow {
        return t.attributes.contains(k);
    }

    /// Forwards to `t.insert(k, v)`.
    void insert(T,K,V)(ref T t, K k, V v) @trusted {
        t.insert(k,v);
    }
}


/// The kinds of token `XmlToken` distinguishes.
enum XmlTokenKind {
    Invalid,
    OpenClose,
    Open,
    Text,
    Comment,
    Type,
    Close
}

/**
 * A single token cut out of an XML-like input: either a run of text or one
 * `<...>` tag. For `Open`, `OpenClose` and `Close` tags the tag name and the
 * attributes are parsed on construction.
 */
struct XmlToken {
public:
    /**
     * Builds a token from its raw text.
     *
     * Params:
     *   d = raw token text; an empty string yields an `Invalid` token
     *   l = line value stored in `line`
     */
    this(string d, size_t l) {
        this.data = d;
        this.line = l;
        if(this.data.length == 0) {
            this.kind = XmlTokenKind.Invalid;
            return;
        }
        this.kind = this.getKind();
        if(this.kind == XmlTokenKind.Open || this.kind ==
                XmlTokenKind.OpenClose || this.kind == XmlTokenKind.Close) {
            this.readName();
            this.readAttributes();
        }
    }

    /// Returns a reference to the attribute value stored under `key`.
    ref string opIndex(string key) {
        version(XML_AA) {
            return this.attributes[key];
        } else {
            return this.attributes[key].value;
        }
    }

    /// Tag name; only set for `Open`, `OpenClose` and `Close` tokens.
    string name;
    XmlTokenKind kind = XmlTokenKind.Invalid;
    version(XML_AA) {
        string[string] attributes;
    } else {
        FashMap!(string,string) attributes;
    }
    /// Token text that has not been consumed by parsing.
    string data;
    size_t line;

private:
    /**
     * Classifies the token by its first characters and removes the leading
     * `<` plus the `/`, `!` or `?` marker from `data`.
     *
     * Returns: `Text` if `data` does not start with `<`, `Invalid` if
     * nothing follows the `<`, otherwise the kind of tag.
     */
    XmlTokenKind getKind() {
        //import std.format : form = format;
        //assert(this.data.length, form("no data at line %u", this.line));
        assert(this.data.length);
        if(this.data[0] != '<') {
            return XmlTokenKind.Text;
        }

        this.data.popFront();
        // A lone '<' (e.g. truncated input) has nothing left to classify.
        if(this.data.empty) {
            return XmlTokenKind.Invalid;
        }

        if(this.data[0] == '/') {
            this.data.popFront();
            return XmlTokenKind.Close;
        } else if(this.data[0] == '!') {
            this.data.popFront();
            return XmlTokenKind.Comment;
        } else if(this.data[0] == '?') {
            this.data.popFront();
            return XmlTokenKind.Type;
        } else if(this.data.length > 1 && this.data[$-2] == '/') {
            // Second to last character is '/', as in "<name/>".
            return XmlTokenKind.OpenClose;
        } else {
            return XmlTokenKind.Open;
        }
    }

    /// Index in `data` of the first non-whitespace character.
    ptrdiff_t readNameBeginIdx() pure {
        return this.data.stripLeftIdx();
    }

    /**
     * Index in `data` one past the end of the tag name: the first
     * whitespace character, `>` or `/` after the start of the name, or
     * `data.length` when none of them occurs.
     */
    ptrdiff_t readNameEndIdx() pure {
        auto lowIdx = readNameBeginIdx();
        auto rel = this.data[lowIdx .. $].indexOfAny(" \t\r\n>/");
        if(rel == -1) {
            // No terminator: the name runs to the end of the data.
            return this.data.length;
        }
        return rel + lowIdx;
    }

    /**
     * Moves the tag name from the front of `data` into `name`. If no name
     * is present, one character is dropped from `data` instead.
     */
    void readName() pure {
        auto low = this.readNameBeginIdx();
        auto high = this.readNameEndIdx();
        assert(low <= this.data.length, this.data);
        assert(high <= this.data.length, this.data);
        if(low < high) {
            this.name = this.data[low .. high];
            this.data = this.data[high .. $];
        } else if(!this.data.empty) {
            this.data.popFront();
        }
    }

    /**
     * Parses `key=value` pairs from `data` into `attributes` until `data`
     * is exhausted or starts with `>` or `/>`. A pair that cannot be parsed
     * triggers an assertion failure.
     */
    void readAttributes() {
        //import std.format : form = format;
        while(!this.data.empty) {
            eatWhitespace(this.data);

            // End of the tag (or of the data): no more attributes.
            if(this.data.empty || this.data[0] == '>'
                    || (this.data.length > 1 && this.data[0 .. 2] == "/>"))
            {
                break;
            }

            string key;
            string attri;
            try {
                key = eatKey(this.data);
                attri = eatAttri(this.data);
            } catch(Exception e) {
                assert(false, "unable to read attribute at line "
                    ~ to!string(this.line));
            }
            version(XML_AA) {
                this.attributes[key] = attri;
            } else {
                insert(this.attributes, key, attri);
            }
        }
    }

    //static auto re = ctRegex!("\\s*(\\w+)\\s*=\\s*\"(\\w+)\"\\s*");
}

/**
 * Input range that splits a character range into `XmlToken`s: runs of text
 * and `<...>` tags. Whitespace in front of a token is skipped.
 */
struct XmlTokenRange(InputRange) {
public:
    /// The not yet consumed part of the underlying input.
    @property InputRange input() {
        return input_;
    }

    /// Replaces the underlying input and reads the first token from it.
    @property void input(InputRange i) {
        input_ = i;
        this.prevWasCR_ = false;
        this.store_ = appender!string();
        this.readFromRange();
    }

    /// Parses the currently buffered token text into an `XmlToken`.
    @property auto front() {
        return XmlToken(this.store_.data(), line);
    }

    /// Discards the current token and reads the next one from the input.
    @property void popFront() {
        this.store_ = appender!string();
        readFromRange();
    }

    /// True when no token is buffered and the input is exhausted.
    @property bool empty() {
        return this.store_.data().empty && std.array.empty(this.input_);
    }

private:
    // Number of line breaks consumed so far.
    size_t line;
    // Whether the last consumed character was '\r', so that the '\n' of a
    // "\r\n" pair is not counted as a second line break.
    bool prevWasCR_;

    /// Updates `line` for one consumed character; "\r\n" counts once.
    void countLine(dchar ch) {
        if(ch == '\r' || (ch == '\n' && !prevWasCR_)) {
            ++this.line;
        }
        prevWasCR_ = (ch == '\r');
    }

    /**
     * Buffers one tag: consumes input until the `<` and `>` characters
     * seen so far balance out. A `<` or `>` directly after a backslash is
     * not counted.
     */
    void equalCrocos() {
        dchar prev = '\0';
        size_t numCrocos = 0;
        while(!input_.empty) {
            immutable dchar it = input_.front;
            input_.popFront();
            countLine(it);

            if(it == '<' && prev != '\\') {
                ++numCrocos;
            } else if(it == '>' && prev != '\\') {
                --numCrocos;
            }

            prev = it;
            this.store_.put(it);

            if(!numCrocos) {
                break;
            }
        }
    }

    /**
     * Buffers one run of text: consumes input up to, but not including,
     * the next `<` that is not directly preceded by a backslash.
     */
    void eatTillCroco() {
        dchar prev = '\0';
        for(; !input_.empty; input_.popFront()) {
            immutable dchar it = input_.front;
            if(it == '<' && prev != '\\') {
                break;
            }
            countLine(it);
            this.store_.put(it);
            prev = it;
        }
    }

    /// Skips whitespace, then buffers the next tag or text run, if any.
    void readFromRange() {
        eatWhiteSpace();
        if(this.input_.empty) {
            return;
        }

        if(this.input_.front == '<') {
            equalCrocos();
        } else {
            eatTillCroco();
        }
    }

    /// Consumes leading whitespace from the input, counting line breaks.
    void eatWhiteSpace() {
        while(!input_.empty && isWhite(std.array.front(input_))) {
            countLine(input_.front);
            input_.popFront();
        }
    }

    InputRange input_;
    Appender!string store_;
}

/// Creates an `XmlTokenRange` that reads from `input`.
auto xmlTokenRange(InputRange)(InputRange input) {
    XmlTokenRange!InputRange ret;
    ret.input = input;
    return ret;
}

unittest {
    auto s = "some fun string<>";
    auto r = xmlTokenRange(s);

    auto f = r.front();
    assert(f.kind == XmlTokenKind.Text);
    r.popFront();
    f = r.front();
    assert(f.kind == XmlTokenKind.Open, to!string(f.kind));
}

unittest {
    static assert(isInputRange!(XmlTokenRange!string));
}

unittest {
    string testString = "hello";
    auto r = xmlTokenRange(testString);
    assert(r.front.data == "hello", r.front.data);
}

unittest {
    string testString = "<hello>";
    auto r = xmlTokenRange(testString);
    assert(r.front.name == "hello", r.front.name);
}

unittest {
    string testString = "<hello/>";
    auto r = xmlTokenRange(testString);
    assert(r.front.kind == XmlTokenKind.OpenClose);
    assert(r.front.name == "hello", "\"" ~ r.front.name ~ "\"");
}

unittest {
    string testString = "</hello>";
    auto r = xmlTokenRange(testString);
    assert(r.front.kind == XmlTokenKind.Close);
    assert(r.front.name == "hello", "\"" ~ r.front.name ~ "\"");
}

unittest {
    string testString = "<hello>";
    string testString2 = "<hello>";
    auto test = testString ~ testString2;
    auto r = xmlTokenRange(test);
}

unittest {
    string testString = "<hello zzz=\"ttt\" world=\"foo\" args=\"bar\">";
    foreach(it; xmlTokenRange(testString)) {
        foreach(key, value; it.attributes) {
            //writefln("%s %s", key, value);
        }
    }
}