Parsing BibTeX in JavaScript

Question

I have a project for which I need to parse BibTeX strings. In v0.2.9 and before, I used Regex, and I want to change that. I set up a while loop system that reads the file char by char, but it seems slow.

I wonder if there is a better way to use while loops, or to parse in general.

Note: I'll worry about escaped characters later, I was just wondering if this method is feasible. Tips for dealing with escaped characters are always welcome, but note that I haven't forgotten them.

function flipDel(str) {
  return str
    .replace(/{/g, '}')
    .split('')
    .reverse()
    .join('')
}

function getDel(stack, end_del) {
  return stack
    .slice(0, end_del.length ? end_del.length - 1 : 0)
    .reverse()
    .join('')
}

var str = '...', // Input string
    arr = []

;(function() {

  var stack = str.split(''),
    curs = '',
    nexr, obj

  function next(bool) {
    curs = stack.shift()
    nexr = stack.slice(0, 1)[0]
  }

  next()

  while (curs) { // Loop through entries

    // Add entry to JSON
    arr.push({
      type: '',
      label: '',
      props: {}
    })

    obj = arr[arr.length - 1]

    // BEGIN TOP ROW (e.g. "@book{authorYEARtitle,")

    while (curs !== '@')
      next()

    next()

    while (curs !== '{') {
      obj.type += curs
      next(true)
    }

    next()

    while (curs !== ',') {
      obj.label += curs
      next(true)
    }

    next()

    // END TOP ROW

    // Whitespace

    while (/^\s$/.test(curs))
      next()

    // BEGIN VALUE

    var key, start_del, end_del, nexs, val, bool_next = true

    // While there are values (check if there are at the end of this loop)
    while (bool_next) {

      bool_next = false,
        key = ''

      // Collect all non-whitespace chars in "key"

      while (curs !== '=') {
        if (!/^\s$/.test(curs)) {
          key += curs
          next(true)
        } else next()
      }

      next()

      while (/^\s$/.test(curs))
        next()

      start_del = ''

      // See what the value delimiter is

      while (/^["{]$/.test(curs)) {
        start_del += curs
        next()
      }

      val = ''

      end_del = flipDel(start_del),
        nexs = getDel(stack, end_del)

      // Collect all characters between the delimiter and the
      // flipped delimiter as "val" (value)
      // 
      // Had to perform a trick to for multiple-char delimiters and
      // no delimiters

      while (!start_del.length && !end_del.length ?
        (curs !== ',') :
        (curs + nexs !== end_del)
      ) {
        val += curs
        next(true)
        nexs = getDel(stack, end_del)
      }

      // End delimiter, comma and whitespace

      while (curs !== ',' && !/^\s$/.test(curs))
        next()

      while (/^\s$/.test(curs))
        next()

      while (curs === ',')
        next()

      while (/^\s$/.test(curs))
        next()

      // If the current char isn't the end of the entry, loop again

      if (curs !== '}')
        bool_next = true

      // Add key-value pair to collection

      obj.props[key] = val
    }

    // END VALUE

    next()

  }

  return;

})();

Here's a working example. There are two almost identical entries to add test cases without having to look up more examples.

$(function() {

  function flipDel(str) {
    return str
      .replace(/{/g, '}')
      .split('')
      .reverse()
      .join('')
  }

  function getDel(stack, end_del) {
    return stack
      .slice(0, end_del.length ? end_del.length - 1 : 0)
      .reverse()
      .join('')
  }

  var $ins = $('#ins'),
    $out = $('#out'),
    str = $ins.find('span').text(),
    arr = []

  ;
  (function() {

    var stack = str.split(''),
      curs = '',
      nexr, obj

    function next(bool) {
      curs = stack.shift()
      nexr = stack.slice(0, 1)[0]
    }

    next()

    while (curs) {

      arr.push({
        type: '',
        label: '',
        props: {}
      })
      obj = arr[arr.length - 1]

      // BEGIN TOP ROW

      while (curs !== '@')
        next()

      next()

      while (curs !== '{') {
        obj.type += curs
        next(true)
      }

      next()

      while (curs !== ',') {
        obj.label += curs
        next(true)
      }

      next()

      // END TOP ROW

      while (/^\s$/.test(curs))
        next()

      // BEGIN VALUE

      var key, start_del, end_del, nexs, val, bool_next = true

      while (bool_next) {

        bool_next = false,
          key = ''

        while (curs !== '=') {
          if (!/^\s$/.test(curs)) {
            key += curs
            next(true)
          } else next()
        }

        next()

        while (/^\s$/.test(curs))
          next()

        start_del = ''

        while (/^["{]$/.test(curs)) {
          start_del += curs
          next()
        }

        val = ''

        end_del = flipDel(start_del),
          nexs = getDel(stack, end_del)

        while (!start_del.length && !end_del.length ?
          (curs !== ',') :
          (curs + nexs !== end_del)
        ) {
          val += curs
          next(true)
          nexs = getDel(stack, end_del)
        }

        while (curs !== ',' && !/^\s$/.test(curs))
          next()

        while (/^\s$/.test(curs))
          next()

        while (curs === ',')
          next()

        while (/^\s$/.test(curs))
          next()

        if (curs !== '}')
          bool_next = true

        obj.props[key] = val
      }

      // END VALUE

      next()

    }

    $out.html(JSON.stringify(arr, null, 2))

    return;

  })();

})

<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<pre id="ins"><span>@article{Steinbeck2003,
  author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
  year = {2003},
  title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
  journal = {Journal of chemical information and computer sciences},
  volume = {43},
  number = {2},
  pages = {493--500},
  doi = {10.1021/ci025584y},
  isbn = {2214707786},
  issn = {0095-2338},
  pmid = {12653513},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513},
}

@article{Steinbeck2003,
  author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
  year = 2003,
  title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
  journal = {Journal of chemical information and computer sciences},
  volume = "43",
  number = {2},
  pages = {493--500},
  doi = "{10.1021/ci025584y}",
  isbn = {2214707786},
  issn = {0095-2338},
  pmid = {12653513},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513}
}</span></pre>
<hr>
<pre id="out"></pre>

Edit

I couldn't find the language settings, so a small explanation: "aanroepen" is calls, "(eigen) tijd" is (own) time, "gem." is average and "bestand" is file.

As wOxxOm predicted, it is mainly next() that is the problem. Suppose I replace all places next() is called with curs = stack[ ++index ] (and I declare var index = 0 somewhere at the top).

Are there better ways of parsing BibTeX, considering performance, coding practice, browser support, and likeliness to fail?

On escaped characters: My idea was to tokenize the stack with forward-looking Regex groups. Is that possible, and are there better ways?

Measure the performance in js profiler, otherwise we can only guess. I'd say it's slow because you modify the stack inside next() instead of using an index. — wOxxOm, Nov 1 at 22:36
@wOxxOm See edit. I hope this is what you meant. (It is in the first results of "js profiler", so I assume so). — LarsW, Nov 2 at 12:51

asked	3 days ago
viewed	28 times

current community

your communities

more stack exchange communities

Parsing BibTeX in JavaScript

Your Answer

Browse other questions tagged javascript parsing tex or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

Parsing BibTeX in JavaScript

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Browse other questions tagged javascript parsing tex or ask your own question.

Related

Hot Network Questions