/**
 * Copyright: © 2014 Economic Modeling Specialists, Intl.
 * Authors: Brian Schott
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
 */
module ddoc.lexer;

/**
 * DDoc token types.
 */
enum Type : ubyte
{
	lParen, /// $(LPAREN)
	rParen, /// $(RPAREN)
	dollar, /// $
	whitespace, /// whitespace
	newline, /// newline
	embedded, /// embedded D code
	inlined, /// backtick-inlined code
	comma, /// ,
	equals, /// =
	header, /// section header
	word, /// Anything else
}

/**
 * DDoc token
 */
struct Token
{
	string text; /// The exact text of the token (or the code for embedded/inlined tokens)
	Type type; /// The token classification
}

/**
 * Lexer for DDoc comments. Exposes the usual input-range interface
 * (empty/front/popFront).
 */
struct Lexer
{
	/**
	 * Params:
	 *     text = the _text to lex
	 *     skipHeader = when true, a word at the beginning of a line followed
	 *         by ':' is lexed as a plain word instead of a section header
	 */
	this(string text, bool skipHeader = false)
	{
		this.text = text;
		this.parseHeader = !skipHeader;
		popFront();
	}

	/// True when every token has been consumed.
	bool empty() const @property
	{
		return _empty;
	}

	/// The token most recently produced by popFront().
	const(Token) front() const @property
	{
		return current;
	}

	/**
	 * Advances to the next token.
	 *
	 * Throws: DdocException when an embedded code block opened with "---"
	 * is never closed before the end of the input.
	 */
	void popFront()
	{
		import std.algorithm : startsWith;
		import std.array : appender;

		if (offset >= text.length)
			_empty = true;
		while (offset < text.length) switch (text[offset])
		{
		case '`':
			offset++;
			immutable size_t inlineCode = inlineCodeIndex();
			if (inlineCode == size_t.max)
			{
				// No closing backtick on this line: emit the backtick
				// itself as an ordinary word.
				current.text = "`";
				current.type = Type.word;
			}
			else
			{
				current.text = text[offset .. inlineCode];
				current.type = Type.inlined;
				offset = inlineCode + 1;
			}
			return;
		case ',':
			current.text = text[offset .. offset + 1];
			current.type = Type.comma;
			offset++;
			return;
		case '=':
			current.text = text[offset .. offset + 1];
			current.type = Type.equals;
			offset++;
			return;
		case '$':
			current.text = text[offset .. offset + 1];
			current.type = Type.dollar;
			offset++;
			return;
		case '(':
			current.text = text[offset .. offset + 1];
			current.type = Type.lParen;
			offset++;
			return;
		case ')':
			current.text = text[offset .. offset + 1];
			current.type = Type.rParen;
			offset++;
			return;
		case '\r':
			// Normalize "\r\n" to a single newline token. The bounds check
			// fixes an out-of-range slice when the input ends with a bare
			// '\r', and a lone '\r' no longer swallows the character that
			// follows it — it is reported as a newline on its own.
			offset++;
			if (offset < text.length && text[offset] == '\n')
				goto case;
			current.text = "\n";
			current.type = Type.newline;
			return;
		case '\n':
			current.text = text[offset .. offset + 1];
			current.type = Type.newline;
			offset++;
			return;
		case '-':
			if (prevIsNewline(offset, text) && text[offset .. $].startsWith("---"))
			{
				current.type = Type.embedded;
				// It's a string because user could mix spaces and tabs.
				string indent = getIndent(offset, text);
				// skip opening dashes
				while (offset < text.length && text[offset] == '-')
					offset++;
				if (offset < text.length && text[offset] == '\r')
					offset++;
				if (offset < text.length && text[offset] == '\n')
				{
					offset++;
					if (text.length > (offset + indent.length)
						&& text[offset .. offset + indent.length] == indent)
					{
						offset += indent.length;
					}
				}
				// Loops until we find the closing '---'.
				// Note that some more checking should be put into this to avoid
				// accidentally matching '---' sequences.
				// If 'indent' is 0, then we can just take a slice. However, in
				// most cases, there will be some indent, and we need to remove it
				// for the code to look nice.
				size_t sliceBegin = offset;
				auto app = appender!string;
				while (true)
				{
					if (offset >= text.length)
						throw new DdocException("Unterminated code block\n" ~ text);
					if (indent.length && text[offset] == '\n')
					{
						app.put(text[sliceBegin .. ++offset]);
						sliceBegin = offset;
						// We need to check if the indentation is the same
						if (text[sliceBegin .. $].startsWith(indent))
						{
							sliceBegin += indent.length;
							offset += indent.length;
						}
					}
					// Check for the end.
					else if (text[offset] == '-' && prevIsNewline(offset, text)
						&& text[offset .. $].startsWith("---"))
					{
						if (indent.length == 0)
							current.text = sliceBegin >= offset ? null : text[sliceBegin .. offset - 1];
						else
						{
							app.put(sliceBegin >= offset ? null : text[sliceBegin .. offset - 1]);
							current.text = app.data;
						}
						// skip closing dashes
						while (offset < text.length && text[offset] == '-')
							offset++;
						break;
					}
					else
						offset++;
				}
			}
			else
			{
				current.type = Type.word;
				current.text = "-";
				offset++;
			}
			return;
		case ' ':
		case '\t':
			// Collapse a run of blanks into a single whitespace token.
			size_t oldOffset = offset;
			while (offset < text.length && (text[offset] == ' '
					|| text[offset] == '\t'))
				offset++;
			current.type = Type.whitespace;
			current.text = text[oldOffset .. offset];
			return;
		default:
			lexWord();
			return;
		}
	}

	//private:

	/**
	 * Lexes a word: a maximal run of alphanumeric/underscore characters.
	 * When header parsing is enabled and the word starts a line and is
	 * immediately followed by ':', it becomes a header token (the ':' is
	 * consumed but not included in the token text).
	 */
	void lexWord()
	{
		import std.utf : decode;
		import std.uni : isNumber, isAlpha;

		size_t oldOffset = offset;
		while (true)
		{
			text.decode(offset);
			if (offset >= text.length)
				break;
			size_t o = offset;
			dchar c = text.decode(o);
			if (!(isAlpha(c) || isNumber(c)) && c != '_')
				break;
		}
		current.type = Type.word;
		current.text = text[oldOffset .. offset];
		if (parseHeader && prevIsNewline(oldOffset, text) && offset < text.length
			&& text[offset] == ':')
		{
			current.type = Type.header;
			offset++;
		}
	}

	/**
	 * Returns: the index of the closing backtick on the current line, or
	 * size_t.max if a line break occurs before one is found.
	 */
	size_t inlineCodeIndex() const
	{
		import std.algorithm : startsWith;

		size_t o = offset;
		while (o < text.length)
		{
			if (text[o .. $].startsWith("\r") || text[o .. $].startsWith("\n")
				|| text[o .. $].startsWith("\u2028")
				|| text[o .. $].startsWith("\u2029"))
			{
				return size_t.max;
			}
			else if (text[o] == '`')
				return o;
			else
				o++;
		}
		return size_t.max;
	}

	Token current; // Most recently lexed token (returned by front)
	size_t offset; // Current read position within text
	string text; // The full input being lexed
	bool _empty; // Set once the input is exhausted
	bool parseHeader; // When true, recognize "Word:" at line start as a header
}

unittest
{
	import std.algorithm : map, equal;
	import std.array : array;

	auto expected = [Type.whitespace, Type.dollar, Type.lParen, Type.word,
		Type.whitespace, Type.word, Type.comma, Type.whitespace, Type.word,
		Type.rParen, Type.whitespace, Type.word, Type.whitespace, Type.word,
		Type.newline, Type.embedded, Type.newline, Type.header, Type.newline,
		Type.whitespace, Type.word, Type.whitespace, Type.equals, Type.whitespace,
		Type.dollar, Type.lParen, Type.word, Type.whitespace, Type.word,
		Type.rParen, Type.newline, Type.header, Type.newline, Type.whitespace,
		Type.word, Type.whitespace, Type.word, Type.whitespace, Type.word];
	Lexer l = Lexer(` $(D something, else) is *a
------------
test
/** this is some test code */
assert (whatever);
---------
Params:
	a = $(A param)
Returns:
	nothing of consequence`c);
	assert(equal(l.map!(a => a.type), expected));

	auto expectedTexts2 = ["inlined code", " ", "identifier"];
	auto expectedTypes2 = [Type.inlined, Type.whitespace, Type.word];
	Lexer l2 = Lexer("`inlined code` identifier");
	auto tokens = l2.array();
	assert(equal(tokens.map!(a => a.type), expectedTypes2));
	assert(equal(tokens.map!(a => a.text), expectedTexts2));
}

/**
 * Class for library exception.
 *
 * Most often, this is thrown when a Ddoc document is misformatted
 * (unmatching parenthesis, too much arguments to a macro...).
299 */ 300 class DdocException : Exception 301 { 302 this(string msg, string file = __FILE__, size_t line = __LINE__, 303 Throwable next = null) nothrow pure @safe 304 { 305 super(msg, file, line, next); 306 } 307 308 // Allow method chaining: 309 // throw new DdocException().snippet(lexer.text); 310 @property DdocException snippet(string s) nothrow pure @safe @nogc 311 { 312 m_snippet = s; 313 return this; 314 } 315 316 @property string snippet() const nothrow pure @safe @nogc 317 { 318 return m_snippet; 319 } 320 321 private string m_snippet; 322 } 323 324 class DdocParseException : DdocException 325 { 326 this(string msg, string code, string file = __FILE__, size_t line = __LINE__, 327 Throwable next = null) nothrow pure @safe 328 { 329 super(msg, file, line, next); 330 this.snippet = code; 331 } 332 } 333 334 bool prevIsNewline(size_t offset, immutable string text) pure nothrow 335 { 336 if (offset == 0) 337 return true; 338 offset--; 339 while (offset > 0 && (text[offset] == ' ' || text[offset] == '\t')) 340 offset--; 341 return text[offset] == '\n'; 342 } 343 344 /// Return the indentation present before the given offset. 345 /// offset should point past the indentation. 346 /// e.g. : '\t\ttest' => Offset should be 2 (the index of 't'), 347 /// and getIndent will return '\t\t'. If offset is 1, 348 /// getIndent returns '\t'. 349 string getIndent(size_t offset, string text) pure nothrow 350 { 351 // If the offset is 0, or there's no indentation before. 352 if (offset < 1 || (text[offset - 1] != ' ' && text[offset - 1] != '\t')) 353 return null; 354 355 // At this point we already know that there's one level of indentation. 356 size_t indent = 1; 357 while (offset >= (indent + 1) // Avoid underflow 358 && (text[offset - indent - 1] == ' ' 359 || text[offset - indent - 1] == '\t')) 360 indent++; 361 return text[offset - indent .. 
offset]; 362 } 363 364 unittest 365 { 366 assert(" " == getIndent(1, " test")); 367 assert(" " == getIndent(2, " test")); 368 assert(!getIndent(3, " test")); 369 assert("\t \t" == getIndent(3, "\t \ttest")); 370 assert("\t " == getIndent(4, "\n\t test")); 371 }