Improve this page Quickly fork, edit online, and submit a pull request for this page. Requires a signed-in GitHub account. This works well for small changes. If you'd like to make larger changes you may want to consider using local clone. Page wiki View or edit the community-maintained wiki page associated with this page.

std.d.lexer

This module contains a range-based lexer for the D programming language.

For performance reasons the lexer contained in this module operates only on UTF-8 encoded source code. If the use of other encodings is desired, the source code must be converted to UTF-8 before passing it to this lexer.

To use the lexer, create a LexerConfig struct. The LexerConfig contains fields for configuring the behavior of the lexer.
LexerConfig config;
config.iterStyle = IterationStyle.everything;
config.tokenStyle = TokenStyle.source;
config.versionNumber = 2064;
config.vendorString = "Lexer Example";
Once you have configured the lexer, call byToken() on your source code, passing in the configuration.
// UTF-8 encoded source code
auto source = "import std.stdio;"c;
auto tokens = byToken(source, config);
// or auto tokens = source.byToken(config);
The result of byToken() is a forward range of tokens that can be easily used with the algorithms from std.algorithm or iterated over with foreach.
assert (tokens.front.type == TokenType.import_);
assert (tokens.front.value == "import");
assert (tokens.front.line == 1);
assert (tokens.front.startIndex == 0);

Examples:
Generate HTML markup of D code.
module highlighter;

import std.stdio;
import std.array;
import std.d.lexer;

/// Writes `value` to stdout wrapped in a `<span>` carrying the given CSS
/// class, escaping '&' and '<' so the emitted markup remains valid HTML.
void writeSpan(string cssClass, string value)
{
    auto escaped = value.replace("&", "&amp;").replace("<", "&lt;");
    stdout.write(`<span class="`, cssClass, `">`, escaped, `</span>`);
}

// http://ethanschoonover.com/solarized
/// Writes an HTML rendering of the given token range to stdout,
/// styling keywords, types, comments, literals, and operators.
void highlight(R)(R tokens)
{
    // Emit the document prologue with the Solarized-based stylesheet.
    stdout.writeln(q"[<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
</head>
<body>
<style type="text/css">
html  { background-color: #fdf6e3; color: #002b36; }
.kwrd { color: #b58900; font-weight: bold;  }
.com  { color: #93a1a1; font-style: italic; }
.num  { color: #dc322f; font-weight: bold;  }
.str  { color: #2aa198; font-style: italic; }
.op   { color: #586e75; font-weight: bold;  }
.type { color: #268bd2; font-weight: bold;  }
.cons { color: #859900; font-weight: bold;  }
</style>
<pre>]");

    foreach (Token t; tokens)
    {
        // isBasicType is the classifier documented by this module
        // (the original example called a nonexistent isBuiltType).
        if (isBasicType(t.type))
            writeSpan("type", t.value);
        else if (isKeyword(t.type))
            writeSpan("kwrd", t.value);
        else if (t.type == TokenType.comment)
            writeSpan("com", t.value);
        else if (isStringLiteral(t.type))
            writeSpan("str", t.value);
        else if (isNumberLiteral(t.type))
            writeSpan("num", t.value);
        else if (isOperator(t.type))
            writeSpan("op", t.value);
        else
            // Plain text (identifiers, whitespace, ...): escape '<' only.
            stdout.write(t.value.replace("<", "&lt;"));
    }
    stdout.writeln("</pre>\n</body></html>");
}

void main(string[] args)
{
    // Create the configuration
    LexerConfig config;
    // Specify that we want tokens to appear exactly as they did in the source
    config.tokenStyle = TokenStyle.source;
    // Include whitespace, comments, etc.
    config.iterStyle = IterationStyle.everything;
    // Tell the lexer to use the name of the file being read when generating
    // error messages.
    config.fileName = args[1];
    // Open the file (error checking omitted for brevity)
    auto f = File(args[1]);
    // Read the lines of the file, and combine them. Then create the token
    // range, which is then passed on to highlight.
    (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
}

License:
Boost License 1.0

Authors:
Brian Schott, Dmitry Olshansky

Source:
std/d/lexer.d

struct Token;
Represents a D token

string value;
The characters that comprise the token.

size_t startIndex;
The index of the start of the token in the original source. (measured in UTF-8 code units)

uint line;
The number of the line the token is on.

ushort column;
The column number of the start of the token in the original source. (measured in ASCII characters or UTF-8 code units)

TokenType type;
The token type.

const pure nothrow bool opEquals(ref const(Token) other);
Check to see if the token is of the same type and has the same string representation as the given token.

Examples:
Token a;
a.type = TokenType.intLiteral;
a.value = "1";
Token b;
b.type = TokenType.intLiteral;
b.value = "1";
assert (a == b);
b.value = "2";
assert (a != b);

const pure nothrow bool opEquals(string value);
Checks to see if the token's string representation is equal to the given string.

Examples:
Token t;
t.value = "abcde";
assert (t == "abcde");

const pure nothrow bool opEquals(TokenType type);
Checks to see if the token is of the given type.

Examples:
Token t;
t.type = TokenType.class_;
assert (t == TokenType.class_);

const pure nothrow int opCmp(ref const(Token) other);
Comparison operator orders tokens by start index.

Examples:
Token a;
a.startIndex = 10;
Token b;
b.startIndex = 20;
assert (a < b);

const pure nothrow int opCmp(size_t index);
Comparison operator overload for checking if the token's start index is before, after, or the same as the given index.

Examples:
import std.array;
import std.range;
auto source = cast(ubyte[]) "a b c"c;
LexerConfig c;
auto tokens = source.byToken(c).array();
assert (tokens.length == 3);
assert (tokens.assumeSorted().lowerBound(3)[1] == "b");
assert (!(tokens[1] < 2));

enum IterationStyle: ushort;
Configure the behavior of the byToken() function. These flags may be combined using a bitwise or.

codeOnly
Only include code, not whitespace or comments

includeComments
Include comment tokens

includeWhitespace
Include whitespace tokens

includeSpecialTokens
Include special tokens

ignoreEOF
Do not stop iteration on reaching the __EOF__ token

everything
Include everything. Equivalent to includeComments | includeWhitespace | ignoreEOF

enum TokenStyle: ushort;
Configuration of the token lexing style. These flags may be combined with a bitwise or.

default_
Escape sequences will be replaced with their equivalent characters, enclosing quote characters will not be included. Special tokens such as __VENDOR__ will be replaced with their equivalent strings. Useful for creating a compiler or interpreter.

notEscaped
Escape sequences will not be processed. An escaped quote character will not terminate string lexing, but it will not be replaced with the quote character in the token.

includeQuotes
Strings will include their opening and closing quote characters as well as any prefixes or suffixes (e.g.: "abcde"w will include the 'w' character as well as the opening and closing quotes)

doNotReplaceSpecial
Do not replace the value field of the special tokens such as __DATE__ with their string equivalents.

source
Strings will be read exactly as they appeared in the source, including their opening and closing quote characters. Useful for syntax highlighting.

struct LexerConfig;
Lexer configuration

IterationStyle iterStyle;
Configure the lexer's iteration style.

See Also:
IterationStyle

TokenStyle tokenStyle;
Configure the style of the tokens produced by the lexer.

See Also:
TokenStyle

uint versionNumber;
Replacement for the __VERSION__ token. Defaults to 100.

string vendorString;
Replacement for the __VENDOR__ token. Defaults to "std.d.lexer"

string fileName;
Name used when creating error messages that are sent to errorFunc. This is needed because the lexer operates on any forward range of ASCII characters or UTF-8 code units and does not know what to call its input source. Defaults to the empty string.

uint startLine;
ushort startColumn;
size_t startIndex;
The starting line and column numbers for the lexer. These can be set when partially lexing D code to provide correct token locations and better error messages. These should be left to their default values of 1 when lexing entire files. Line and column numbers are 1-indexed in this lexer because this produces more useful error messages. The start index is zero-indexed, as it is more useful to machines than users.

void delegate(string, size_t, uint, ushort, string) errorFunc;
This function is called when an error is encountered during lexing. If this field is not set, the lexer will throw an exception including the line, column, and error message.

Error Function Parameters:
string File name
size_t Code unit index
uint Line number
ushort Column number
string Error message

auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4 * 1024) if (isForwardRange!R && !isRandomAccessRange!R && is(ElementType!R : const(ubyte)));
auto byToken(R)(R range, LexerConfig config) if (isRandomAccessRange!R && is(ElementType!R : const(ubyte)));
Iterate over the given range of characters by D tokens.

The lexing process is able to handle a forward range of code units by using an internal circular buffer to provide efficient extracting of the token values from the input. It is more efficient, however, to provide a range that supports random accessing and slicing. If the input range supports slicing, the caching layer aliases itself away and the lexing process is much more efficient.

Parameters:
range the range of characters to lex
config the lexer configuration
bufferSize initial size of internal circular buffer

Returns:
a TokenRange that iterates over the given range

struct TokenRange(LexSrc);
Range of tokens. Use byToken() to instantiate.

const @property bool empty();
Returns:
true if the range is empty

const @property const(Token) front();
Returns:
the current token

Token moveFront();
Returns:
the current token and then removes it from the range

void popFront();
Removes the current token from the range

pure nothrow bool isOperator(const TokenType t);
pure nothrow bool isOperator(ref const Token t);
Returns:
true if the token is an operator

pure nothrow bool isKeyword(const TokenType t);
pure nothrow bool isKeyword(ref const Token t);
Returns:
true if the token is a keyword

pure nothrow bool isBasicType(const TokenType t);
pure nothrow bool isBasicType(ref const Token t);
Returns:
true if the token is a built-in type

pure nothrow bool isAttribute(const TokenType t);
pure nothrow bool isAttribute(ref const Token t);
Returns:
true if the token is an attribute

pure nothrow bool isProtection(const TokenType t);
pure nothrow bool isProtection(ref const Token t);
Returns:
true if the token is a protection attribute

pure nothrow bool isConstant(const TokenType t);
pure nothrow bool isConstant(ref const Token t);
Returns:
true if the token is a compile-time constant such as __DATE__

pure nothrow bool isLiteral(const TokenType t);
pure nothrow bool isLiteral(ref const Token t);
Returns:
true if the token is a string or number literal

pure nothrow bool isNumberLiteral(const TokenType t);
pure nothrow bool isNumberLiteral(ref const Token t);
Returns:
true if the token is a number literal

pure nothrow bool isStringLiteral(const TokenType t);
pure nothrow bool isStringLiteral(ref const Token t);
Returns:
true if the token is a string literal

pure nothrow bool isMisc(const TokenType t);
pure nothrow bool isMisc(ref const Token t);
Returns:
true if the token is whitespace, a comment, a special token sequence, or an identifier

enum TokenType: ushort;
Listing of all the tokens in the D language.

invalid
Not a valid token

assign
=

at
@

amp
&

bitAndAssign
&=

bitOr
|

bitOrAssign
|=

catAssign
~=

colon
:

comma
,

decrement
--

div
/

divAssign
/=

dollar
$

dot
.

equal
==

goesTo
=>

greater
>

greaterEqual
>=

hash
#

increment
++

lBrace
{

lBracket
[

less
<

lessEqual
<=

lessEqualGreater
<>=

lessOrGreater
<>

logicAnd
&&

logicOr
||

lParen
(

minus
-

minusAssign
-=

mod
%

modAssign
%=

mulAssign
*=

not
!

notEqual
!=

notGreater
!>

notGreaterEqual
!>=

notLess
!<

notLessEqual
!<=

notLessEqualGreater
!<>

plus
+

plusAssign
+=

pow
^^

powAssign
^^=

rBrace
}

rBracket
]

rParen
)

semicolon
;

shiftLeft
<<

shiftLeftAssign
<<=

shiftRight
>>

shiftRightAssign
>>=

dotdot
..

star
*

ternary
?

tilde
~

unordered
!<>=

unsignedShiftRight
>>>

unsignedShiftRightAssign
>>>=

vararg
...

xor
^

xorAssign
^=

bool_
bool

byte_
byte

cdouble_
cdouble

cent_
cent

cfloat_
cfloat

char_
char

creal_
creal

dchar_
dchar

double_
double

float_
float

idouble_
idouble

ifloat_
ifloat

int_
int

ireal_
ireal

long_
long

real_
real

short_
short

ubyte_
ubyte

ucent_
ucent

uint_
uint

ulong_
ulong

ushort_
ushort

void_
void

wchar_
wchar

align_
align

deprecated_
deprecated

extern_
extern

pragma_
pragma

export_
export

package_
package

private_
private

protected_
protected

public_
public

abstract_
abstract

auto_
auto

const_
const

final_
final

gshared
__gshared

immutable_
immutable

inout_
inout

scope_
scope

shared_
shared

static_
static

override_
override

pure_
pure

ref_
ref

synchronized_
synchronized

alias_
alias

asm_
asm

assert_
assert

body_
body

break_
break

case_
case

cast_
cast

catch_
catch

class_
class

continue_
continue

debug_
debug

default_
default

delegate_
delegate

function_
function

delete_
delete

do_
do

else_
else

enum_
enum

false_
false

finally_
finally

foreach_
foreach

foreach_reverse_
foreach_reverse

for_
for

goto_
goto

if_
if

import_
import

in_
in

interface_
interface

invariant_
invariant

is_
is

lazy_
lazy

macro_
macro

mixin_
mixin

module_
module

new_
new

nothrow_
nothrow

null_
null

out_
out

return_
return

struct_
struct

super_
super

switch_
switch

template_
template

this_
this

throw_
throw

true_
true

try_
try

typedef_
typedef

typeid_
typeid

typeof_
typeof

union_
union

unittest_
unittest

version_
version

volatile_
volatile

while_
while

traits
__traits

parameters
__parameters

vector
__vector

with_
with

specialDate
__DATE__

specialEof
__EOF__

specialTime
__TIME__

specialTimestamp
__TIMESTAMP__

specialVendor
__VENDOR__

specialVersion
__VERSION__

specialFile
__FILE__

specialLine
__LINE__

specialModule
__MODULE__

specialFunction
__FUNCTION__

specialPrettyFunction
__PRETTY_FUNCTION__

specialTokenSequence
#line 10 "file.d"

comment
/** comment */ or // comment or ///comment

identifier
anything else

scriptLine
Line at the beginning of source file that starts from #!

whitespace
whitespace

doubleLiteral
123.456

floatLiteral
123.456f or 0x123_45p-3

idoubleLiteral
123.456i

ifloatLiteral
123.456fi

intLiteral
123 or 0b1101010101

longLiteral
123L

realLiteral
123.456L

irealLiteral
123.456Li

uintLiteral
123u

ulongLiteral
123uL

characterLiteral
'a'

dstringLiteral
"32-bit string"d

stringLiteral
"an 8-bit string"

wstringLiteral
"16-bit string"w

pure string getTokenValue(const TokenType type);
Look up a token's string representation by its type.

Parameters:
TokenType type the token type

Returns:
a string representing the token, or null for token types such as identifier or integer literal whose string representations vary

Examples:
// The class token always has one value
assert (getTokenValue(TokenType.class_) == "class");
// Identifiers do not
assert (getTokenValue(TokenType.identifier) is null);