1 /**
2  * Copyright: © 2014 Economic Modeling Specialists, Intl.
3  * Authors: Brian Schott
4  * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
5  */
6 module ddoc.lexer;
7 
8 /**
9  * DDoc token types.
10  */
enum Type : ubyte
{
	/// $(LPAREN): opening parenthesis
	lParen,
	/// $(RPAREN): closing parenthesis
	rParen,
	/// $: dollar sign
	dollar,
	/// a run of spaces and/or tabs
	whitespace,
	/// a line break
	newline,
	/// embedded D code (a block delimited by lines of dashes)
	embedded,
	/// backtick-inlined code
	inlined,
	/// , (comma)
	comma,
	/// = (equals sign)
	equals,
	/// section header (a word at the start of a line followed by a colon)
	header,
	/// anything else
	word,
}
25 
26 /**
27  * DDoc token
28  */
struct Token
{
	/// The text carried by the token, as set by the lexer.
	string text;

	/// The kind of token.
	Type type;
}
34 
35 /**
36  * Lexer for DDoc comments.
37  */
struct Lexer
{
	/**
	 * Creates a lexer over the given text and lexes the first token.
	 *
	 * Params:
	 *     text = the _text to lex
	 *     skipHeader = when `true`, section headers are not recognized: a word
	 *         followed by a colon at the start of a line stays a plain
	 *         `Type.word` and the colon is left for the next token
	 */
	this(string text, bool skipHeader = false)
	{
		this.text = text;
		this.parseHeader = !skipHeader;
		popFront();
	}

	/// Input range primitive: `true` once every token has been consumed.
	bool empty() const @property
	{
		return _empty;
	}

	/// Input range primitive: the most recently lexed token.
	const(Token) front() const @property
	{
		return current;
	}

	/**
	 * Input range primitive: lexes the next token into `current`, or marks
	 * the range as empty when the end of the text has been reached.
	 *
	 * Throws: DdocException if an embedded code block is never closed.
	 */
	void popFront()
	{
		import std.algorithm : startsWith;

		if (offset >= text.length)
		{
			_empty = true;
			return;
		}

		switch (text[offset])
		{
		case '`':
			offset++;
			immutable size_t inlineCode = inlineCodeIndex();
			if (inlineCode == size_t.max)
			{
				// No closing backtick on this line: the backtick is a word.
				current.text = "`";
				current.type = Type.word;
			}
			else
			{
				// The token text excludes both backticks.
				current.text = text[offset .. inlineCode];
				current.type = Type.inlined;
				offset = inlineCode + 1;
			}
			return;
		case ',':
			lexSingleChar(Type.comma);
			return;
		case '=':
			lexSingleChar(Type.equals);
			return;
		case '$':
			lexSingleChar(Type.dollar);
			return;
		case '(':
			lexSingleChar(Type.lParen);
			return;
		case ')':
			lexSingleChar(Type.rParen);
			return;
		case '\r':
			// "\r\n" is a single newline token whose text is "\n".
			// A lone '\r' (including one at the very end of the input) is
			// emitted as its own newline token; skipping it unconditionally
			// would read past the end of the text or swallow the following
			// character.
			if (offset + 1 < text.length && text[offset + 1] == '\n')
				offset++;
			goto case;
		case '\n':
			lexSingleChar(Type.newline);
			return;
		case '-':
			if (prevIsNewline(offset, text) && text[offset .. $].startsWith("---"))
				lexEmbedded();
			else
			{
				current.type = Type.word;
				current.text = "-";
				offset++;
			}
			return;
		case ' ':
		case '\t':
			immutable size_t oldOffset = offset;
			while (offset < text.length && (text[offset] == ' '
				|| text[offset] == '\t'))
				offset++;
			current.type = Type.whitespace;
			current.text = text[oldOffset .. offset];
			return;
		default:
			lexWord();
			return;
		}
	}

	//private:

	/// Emits the single character at `offset` as a token of the given type.
	private void lexSingleChar(Type type)
	{
		current.text = text[offset .. offset + 1];
		current.type = type;
		offset++;
	}

	/**
	 * Lexes an embedded code block. `offset` must point at the first dash of
	 * the opening delimiter line.
	 *
	 * The indentation found in front of the opening delimiter is removed
	 * from the start of every line of the block on which it appears.
	 *
	 * Throws: DdocException if no closing delimiter is found.
	 */
	private void lexEmbedded()
	{
		import std.algorithm : startsWith;
		import std.array : appender;

		current.type = Type.embedded;
		// It's a string because user could mix spaces and tabs.
		string indent = getIndent(offset, text);
		// Skip the opening dashes and the line break that follows them.
		while (offset < text.length && text[offset] == '-')
			offset++;
		if (offset < text.length && text[offset] == '\r')
			offset++;
		if (offset < text.length && text[offset] == '\n')
		{
			offset++;
			if (text.length > (offset + indent.length)
				&& text[offset .. offset + indent.length] == indent)
			{
				offset += indent.length;
			}
		}
		// Loops until we find the closing '---'.
		// Note that some more checking should be put into this to avoid
		// accidentally matching '---' sequences.
		// If there is no indent, then we can just take a slice. However, in
		// most cases, there will be some indent, and we need to remove it
		// for the code to look nice.
		size_t sliceBegin = offset;
		auto app = appender!string();
		while (true)
		{
			if (offset >= text.length)
				throw new DdocException("Unterminated code block\n" ~ text);
			if (indent.length != 0 && text[offset] == '\n')
			{
				// Flush the line (including its '\n') and start a new slice.
				app.put(text[sliceBegin .. ++offset]);
				sliceBegin = offset;
				// Drop the indentation if the next line starts with it.
				if (text[sliceBegin .. $].startsWith(indent))
				{
					sliceBegin += indent.length;
					offset += indent.length;
				}
			}
			// Check for the end.
			else if (text[offset] == '-' && prevIsNewline(offset, text)
				&& text[offset .. $].startsWith("---"))
			{
				// The character right before the closing dashes is dropped.
				string tail = sliceBegin >= offset ? null : text[sliceBegin .. offset - 1];
				if (indent.length == 0)
					current.text = tail;
				else
				{
					app.put(tail);
					current.text = app.data;
				}
				// Skip the closing dashes.
				while (offset < text.length && text[offset] == '-')
					offset++;
				return;
			}
			else
				offset++;
		}
	}

	/**
	 * Lexes a word starting at `offset`.
	 *
	 * The first code point is always consumed; after that, code points are
	 * consumed for as long as they are alphabetic, numeric or an underscore.
	 * When header parsing is enabled and the word starts a line and is
	 * directly followed by a colon, the token becomes a `Type.header` and the
	 * colon is consumed (it is not part of the token text).
	 */
	void lexWord()
	{
		import std.utf : decode;
		import std.uni : isNumber, isAlpha;

		size_t oldOffset = offset;
		while (true)
		{
			// Consume one code point, then peek at the next one.
			text.decode(offset);
			if (offset >= text.length)
				break;
			size_t o = offset;
			dchar c = text.decode(o);
			if (!(isAlpha(c) || isNumber(c)) && c != '_')
				break;
		}
		current.type = Type.word;
		current.text = text[oldOffset .. offset];
		if (parseHeader && prevIsNewline(oldOffset, text) && offset < text.length
			&& text[offset] == ':')
		{
			current.type = Type.header;
			offset++;
		}
	}

	/**
	 * Searches for the backtick that closes an inline code span, starting at
	 * `offset`.
	 *
	 * Returns: the index of the closing backtick, or `size_t.max` if a line
	 * break (CR, LF, U+2028 or U+2029) or the end of the text comes first.
	 */
	size_t inlineCodeIndex() const
	{
		import std.algorithm : startsWith;

		size_t o = offset;
		while (o < text.length)
		{
			if (text[o .. $].startsWith("\r") || text[o .. $].startsWith("\n")
				|| text[o .. $].startsWith("\u2028")
				|| text[o .. $].startsWith("\u2029"))
			{
				return size_t.max;
			}
			else if (text[o] == '`')
				return o;
			else
				o++;
		}
		return size_t.max;
	}

	/// The token returned by `front`.
	Token current;
	/// Index into `text` of the next character to lex.
	size_t offset;
	/// The text being lexed.
	string text;
	/// Set once `popFront` is called with no input left.
	bool _empty;
	/// Whether words followed by a colon at line start become headers.
	bool parseHeader;
}
258 
// Checks the token types produced for a comment mixing macros, an embedded
// code block and section headers, then the types and texts produced for
// backtick-inlined code.
unittest
{
	import std.algorithm : map, equal;
	import std.array : array;

	auto expectedTypes = [
		Type.whitespace, Type.dollar, Type.lParen, Type.word,
		Type.whitespace, Type.word, Type.comma, Type.whitespace, Type.word,
		Type.rParen, Type.whitespace, Type.word, Type.whitespace, Type.word,
		Type.newline, Type.embedded, Type.newline, Type.header, Type.newline,
		Type.whitespace, Type.word, Type.whitespace, Type.equals, Type.whitespace,
		Type.dollar, Type.lParen, Type.word, Type.whitespace, Type.word,
		Type.rParen, Type.newline, Type.header, Type.newline, Type.whitespace,
		Type.word, Type.whitespace, Type.word, Type.whitespace, Type.word
	];
	Lexer lexer = Lexer(` $(D something, else) is *a
------------
test
/** this is some test code */
assert (whatever);
---------
Params:
	a = $(A param)
Returns:
	nothing of consequence`c);
	assert(equal(lexer.map!(tok => tok.type), expectedTypes));

	// Inline code: the backticks are stripped from the token text.
	auto inlineTexts = ["inlined code", " ", "identifier"];
	auto inlineTypes = [Type.inlined, Type.whitespace, Type.word];
	Lexer inlineLexer = Lexer("`inlined code` identifier");
	auto inlineTokens = inlineLexer.array();
	assert(equal(inlineTokens.map!(tok => tok.type), inlineTypes));
	assert(equal(inlineTokens.map!(tok => tok.text), inlineTexts));
}
293 
/**
 * Base class for the exceptions thrown by this library.
 *
 * Most often, this is thrown when a Ddoc document is malformed
 * (mismatched parentheses, too many arguments to a macro...).
 */
class DdocException : Exception
{
	private string m_snippet;

	/**
	 * Params:
	 *     msg = the error message
	 *     file = source file of the throw site
	 *     line = source line of the throw site
	 *     next = the next exception in the chain, if any
	 */
	this(string msg, string file = __FILE__, size_t line = __LINE__,
		Throwable next = null) nothrow pure @safe
	{
		super(msg, file, line, next);
	}

	/**
	 * Stores a snippet on the exception and returns the exception itself,
	 * which allows method chaining:
	 * ---
	 * throw new DdocException("message").snippet(lexer.text);
	 * ---
	 */
	@property DdocException snippet(string s) nothrow pure @safe @nogc
	{
		m_snippet = s;
		return this;
	}

	/// Returns the stored snippet (`null` if none was set).
	@property string snippet() const nothrow pure @safe @nogc
	{
		return m_snippet;
	}
}
323 
/**
 * A `DdocException` whose snippet is set at construction time.
 */
class DdocParseException : DdocException
{
	/**
	 * Params:
	 *     msg = the error message
	 *     code = the text stored as the exception's snippet
	 *     file = source file of the throw site
	 *     line = source line of the throw site
	 *     next = the next exception in the chain, if any
	 */
	this(string msg, string code, string file = __FILE__,
		size_t line = __LINE__, Throwable next = null) nothrow pure @safe
	{
		super(msg, file, line, next);
		snippet = code;
	}
}
333 
/**
 * Tells whether `offset` is at the start of a line, ignoring any spaces and
 * tabs located between the start of the line and `offset`.
 *
 * The start of the text counts as the start of a line, including when only
 * spaces and tabs separate it from `offset`.
 *
 * Params:
 *     offset = an index into `text` (at most `text.length`)
 *     text = the text to inspect
 *
 * Returns: `true` if only spaces and tabs lie between `offset` and the
 * preceding `'\n'` or the start of the text.
 */
bool prevIsNewline(size_t offset, immutable string text) pure nothrow
{
	// Walk backwards over the horizontal whitespace preceding `offset`.
	while (offset > 0 && (text[offset - 1] == ' ' || text[offset - 1] == '\t'))
		offset--;
	// Either the start of the text was reached, or the first character that
	// is not a space or tab must be a line feed.
	return offset == 0 || text[offset - 1] == '\n';
}
343 
/**
 * Returns the indentation (spaces and tabs) located right before `offset`.
 *
 * `offset` should point just past the indentation. For instance, with the
 * text `"\t\ttest"`, an offset of 2 (the index of the first 't') gives
 * `"\t\t"`, whereas an offset of 1 gives `"\t"`.
 *
 * Params:
 *     offset = index into `text` of the first character after the indentation
 *     text = the text to inspect
 *
 * Returns: the slice of `text` holding the indentation, or `null` if
 * `offset` is 0 or the character before it is neither a space nor a tab.
 */
string getIndent(size_t offset, string text) pure nothrow
{
	// Scan backwards from `offset` for as long as we see spaces or tabs.
	size_t start = offset;
	while (start > 0 && (text[start - 1] == ' ' || text[start - 1] == '\t'))
		start--;

	return start == offset ? null : text[start .. offset];
}
363 
// getIndent: partial and full indentation, no indentation, mixed spaces and
// tabs, and indentation following a line break.
unittest
{
	immutable twoSpaces = "  test";

	assert(getIndent(1, twoSpaces) == " ");
	assert(getIndent(2, twoSpaces) == "  ");
	assert(!getIndent(3, twoSpaces));
	assert(getIndent(3, "\t \ttest") == "\t \t");
	assert(getIndent(4, "\n\t  test") == "\t  ");
}