Ben Joffe |JavaScript Syntax Highlighter

JavaScript Syntax Highlighter

This is a JavaScript syntax highlighter experiment. You can enter JavaScript code into the textarea (down below) and you will be able to view the code with syntax highlighting.

It first builds a token list, then passes that list to a function that outputs the data. The tokeniser is designed to be as accurate as possible; for example, the code ++ will be read as one operator token (increment), while +- will be read as two operator tokens (plus, then minus). Although this is not strictly required for a syntax highlighter, it is implemented so that I may later extend this code to allow crushing/beautifying and/or actual parsing and executing. This level of accuracy does have benefits for syntax highlighting, however, as tokens such as { and } can be coloured differently depending on whether they are block-level indicators or object literals.

Every open-source syntax highlighter that I have tried fails on at least some valid JS. Common pitfalls include:

Failure to recognise numbers that start with a period, eg: .01
Failure to recognise that the second period in the following is an operator, and not part of the number: 0.1.method(); // yes this is valid JS
Failure to handle multiline strings
Interpreting the following as containing a regular expression: 1/2/3;
Ending regular expression tokens prematurely, eg: /reg[/]exp/; /[/*regexp*/]/;
Recognising some edge-case 'divide' operators as regular expression opening delimiters, e.g.:
/regexp/
/notRegexp/g;
In this example the first line does not end with a semicolon, so the first / on the next line immediately becomes a division operator, with notRegexp and g as variables.

I know of only one bug in my implementation: if an object literal is placed in the false branch of a ternary (conditional) expression, then it is highlighted as a block-level token (though most other libraries don't distinguish between these). Fixed!

Christian Krebbs writes in to report another bug: a regular expression following a variable declaration (without an assignment or semicolon) is interpreted as a division. This one is going to be particularly difficult to fix. If you find another code sequence that fails to highlight correctly, please contact me.

// this is a prefilled testsuite, replace it with your own code

a = 1;

a = 0.1;

a = .1;

a = 0x;

a = 0xfF1;

a = 0XE;

a = 1e1;

a = 1E1;

a = 1e+1;

a = 1e-1;

a = 1.e1;

a = 0xe+1;

a = 1 .method();

a = 1..method();

a = 0.1.method();

a = .1.method();

a = 1e1.method();

a = 00.method();

a = 0x.method();

a = 0xF.method();

a = "x";

a = "x\
y";

"x";

a = 'x';

a = "x'y";

a = 'x"y';

a = "x\"y";

a = "x\\\"y";

a = "x\\";

a = /regexp/;

/regexp/;

/regexp/g;

/"regexp"/;

/reg[/]exp/;

/reg\/exp/;

/regexp//1;

/\/*regexp*/;

/[/*regexp*/]/;

!!/regexp/;

((/regexp/));

[[/regexp/]];

a = 1/2/3;

a = a/1/2;

a = a++/1/2;

a = this/1/2;

a = (1)/2/3;

a = []/1/2;

a = {}/1/2;

({}/1/2);

+{}/1/2;

{}/regexp/;

/regexp/
/1/2;

function(){}
/regexp/

do /regexp/.test('x') while(false);

true ? {} : {}/1/2;

a = a
/1/2;

var a
/regexp/; // this test fails

a = 1
/2/3;

a = 1//comment

a = 1/2// comment

a = 1 / 2; // comment

a = 1//comment
/2/3;

a = 1/*comment*//2;

a = 1+/*comment*/1/2;

a = 1/**//2;

/*/ comment /*/

/*/\*
comment
*/

// /* */ comment

a2 = 1;

π = 1;

\u0061 = 1;

a\u03c0 = 1;

a\u01Acπ = 1;

/*************************/
/* end of main testsuite */
/*************************/

if (true)
{
  a = {
    b: (function()
    {
      c = {
        d: {}
      };
    }()),
    x: function()
    {
      var b = 1 ? {} : {};
      a: {
        // labelled block
      }
      switch (true)
      {
        case 1 : yeah; break;
        case 2 : {
          blockStuff();
          a = {};
        }
        default : {
          blockStuff();
        }
      }
    },
    y: 2
  }
}

var a1 = "I like \"ponies\"\
they are fun.";
var a2 = 1/2/3 + 5e3; // funny number
var b = 0xfbe3D; /* multiline com..
...ment */50
alert(a+b);

var arr = [];
arr[0] = /[/]/;
arr[1] = (1)/2;
arr[2] = "1"/2;
arr[3] = /a//1;
arr[4] = /\//;
arr[5] = true/1;
arr[6] = false/1;
arr[7] = null/1;
arr[8] = this/1;
arr[9] = getReg();
arr[10] = {}/1;

{}/a/;
var a = {}/1;

function getReg()
{
  return /a/;
}

do /a/ while (!/a/);

a < b ; a >= b ; a === b;
a > b ; a <= b ; a !== b;
a + b ; a == b ; a >> b;
a - b ; a != b ; a << b;
a * b ; a += b ; a >>> b;
a / b ; a *= b ; a <<< b; // fake one '<<<' does not exist in JS
a % b ; a /= b ; a >>= b;
a & b ; a -= b ; a <<= b;
a | b ; a %= b ; a >>>= b;
a ^ b ; a &= b ;
a ++  ; a |= b ;
a --  ; a ^= b ;
a && b ;
a || b ;

if (b || b>=1 && b<=50)
{

}

str.replace(/\&/g, '&').replace(/</g, '<').replace(/\r\n|\r|\n/g,'<br>');

/foo/;  // a slash starting a line treated as a regexp beginning
"foo".match(/fo+$/);
// this line comment not treated as a regular expressions
"foo /bar/".test(/"baz"/);  // test string and regexp boundaries
var division = /\b\d+\/\d+/g;  // test char sets and escaping of specials
var allSpecials = /([^\[\]\{\}\-\?\+\*\.\^\$\/]+)\\/;
var slashInCharset = /[^/]/g, notCloseSq = /[^\]]/;

// test that slash used in numeric context treated as an operator
1 / 2;
1. / x;
x / y;
(x) / y;
1 /* foo */ / 2;
1 /* foo *// 2;
1/2;
1./x;
x/y;
(x)/y;

// test split over two lines.  line comment should not fool it
1//
/2;

x++/y;
x--/y;
x[y] / z;
f() / n;

// test that slash after non postfix operator is start of regexp
log('matches = ' + /foo/.test(foo));

// test keyword preceders
return /a regexp/;
division = notreturn / not_a_regexp / 2;  // keyword suffix does not match

Note: This is not necessarily designed to fail gracefully on invalid input.