Code Review Stack Exchange is a question and answer site for peer programmer code reviews. Join them; it only takes a minute:

Sign up
Here's how it works:
  1. Anybody can ask a question
  2. Anybody can answer
  3. The best answers are voted up and rise to the top

I have a project for which I need to parse BibTeX strings. In v0.2.9 and before, I used Regex, and I want to change that. I set up a while loop system that reads the file char by char, but it seems slow.

I wonder if there is a better way to use while loops, or to parse in general.

Note: I'll worry about escaped characters later, I was just wondering if this method is feasible. Tips for dealing with escaped characters are always welcome, but note that I haven't forgotten them.

function flipDel(str) {
  return str
    .replace(/{/g, '}')
    .split('')
    .reverse()
    .join('')
}

function getDel(stack, end_del) {
  return stack
    .slice(0, end_del.length ? end_del.length - 1 : 0)
    .reverse()
    .join('')
}

var str = '...', // Input string
    arr = []

;(function() {

  var stack = str.split(''),
    curs = '',
    nexr, obj

  function next(bool) {
    curs = stack.shift()
    nexr = stack.slice(0, 1)[0]
  }

  next()

  while (curs) { // Loop through entries

    // Add entry to JSON
    arr.push({
      type: '',
      label: '',
      props: {}
    })

    obj = arr[arr.length - 1]

    // BEGIN TOP ROW (e.g. "@book{authorYEARtitle,")

    while (curs !== '@')
      next()

    next()

    while (curs !== '{') {
      obj.type += curs
      next(true)
    }

    next()

    while (curs !== ',') {
      obj.label += curs
      next(true)
    }

    next()

    // END TOP ROW

    // Whitespace

    while (/^\s$/.test(curs))
      next()

    // BEGIN VALUE

    var key, start_del, end_del, nexs, val, bool_next = true

    // While there are values (check if there are at the end of this loop)
    while (bool_next) {

      bool_next = false,
        key = ''

      // Collect all non-whitespace chars in "key"

      while (curs !== '=') {
        if (!/^\s$/.test(curs)) {
          key += curs
          next(true)
        } else next()
      }

      next()

      while (/^\s$/.test(curs))
        next()

      start_del = ''

      // See what the value delimiter is

      while (/^["{]$/.test(curs)) {
        start_del += curs
        next()
      }

      val = ''

      end_del = flipDel(start_del),
        nexs = getDel(stack, end_del)

      // Collect all characters between the delimiter and the
      // flipped delimiter as "val" (value)
      // 
      // Had to perform a trick to for multiple-char delimiters and
      // no delimiters

      while (!start_del.length && !end_del.length ?
        (curs !== ',') :
        (curs + nexs !== end_del)
      ) {
        val += curs
        next(true)
        nexs = getDel(stack, end_del)
      }

      // End delimiter, comma and whitespace

      while (curs !== ',' && !/^\s$/.test(curs))
        next()

      while (/^\s$/.test(curs))
        next()

      while (curs === ',')
        next()

      while (/^\s$/.test(curs))
        next()

      // If the current char isn't the end of the entry, loop again

      if (curs !== '}')
        bool_next = true

      // Add key-value pair to collection

      obj.props[key] = val
    }

    // END VALUE

    next()

  }

  return;

})();

Here's a working example. There are two almost identical entries to add test cases without having to look up more examples.

$(function() {

  function flipDel(str) {
    return str
      .replace(/{/g, '}')
      .split('')
      .reverse()
      .join('')
  }

  function getDel(stack, end_del) {
    return stack
      .slice(0, end_del.length ? end_del.length - 1 : 0)
      .reverse()
      .join('')
  }

  var $ins = $('#ins'),
    $out = $('#out'),
    str = $ins.find('span').text(),
    arr = []

  ;
  (function() {

    var stack = str.split(''),
      curs = '',
      nexr, obj

    function next(bool) {
      curs = stack.shift()
      nexr = stack.slice(0, 1)[0]
    }

    next()

    while (curs) {

      arr.push({
        type: '',
        label: '',
        props: {}
      })
      obj = arr[arr.length - 1]

      // BEGIN TOP ROW

      while (curs !== '@')
        next()

      next()

      while (curs !== '{') {
        obj.type += curs
        next(true)
      }

      next()

      while (curs !== ',') {
        obj.label += curs
        next(true)
      }

      next()

      // END TOP ROW

      while (/^\s$/.test(curs))
        next()

      // BEGIN VALUE

      var key, start_del, end_del, nexs, val, bool_next = true

      while (bool_next) {

        bool_next = false,
          key = ''

        while (curs !== '=') {
          if (!/^\s$/.test(curs)) {
            key += curs
            next(true)
          } else next()
        }

        next()

        while (/^\s$/.test(curs))
          next()

        start_del = ''

        while (/^["{]$/.test(curs)) {
          start_del += curs
          next()
        }

        val = ''

        end_del = flipDel(start_del),
          nexs = getDel(stack, end_del)

        while (!start_del.length && !end_del.length ?
          (curs !== ',') :
          (curs + nexs !== end_del)
        ) {
          val += curs
          next(true)
          nexs = getDel(stack, end_del)
        }

        while (curs !== ',' && !/^\s$/.test(curs))
          next()

        while (/^\s$/.test(curs))
          next()

        while (curs === ',')
          next()

        while (/^\s$/.test(curs))
          next()

        if (curs !== '}')
          bool_next = true

        obj.props[key] = val
      }

      // END VALUE

      next()

    }

    $out.html(JSON.stringify(arr, null, 2))

    return;

  })();

})
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<pre id="ins"><span>@article{Steinbeck2003,
  author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
  year = {2003},
  title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
  journal = {Journal of chemical information and computer sciences},
  volume = {43},
  number = {2},
  pages = {493--500},
  doi = {10.1021/ci025584y},
  isbn = {2214707786},
  issn = {0095-2338},
  pmid = {12653513},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513},
}

@article{Steinbeck2003,
  author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
  year = 2003,
  title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
  journal = {Journal of chemical information and computer sciences},
  volume = "43",
  number = {2},
  pages = {493--500},
  doi = "{10.1021/ci025584y}",
  isbn = {2214707786},
  issn = {0095-2338},
  pmid = {12653513},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513}
}</span></pre>
<hr>
<pre id="out"></pre>

Edit


profile

I couldn't find the language settings, so a small explanation: "aanroepen" is calls, "(eigen) tijd" is (own) time, "gem." is average and "bestand" is file.

As wOxxOm predicted, it is mainly next() that is the problem. Suppose I replace all places next() is called with curs = stack[ ++index ] (and I declare var index = 0 somewhere at the top).

Are there better ways of parsing BibTeX, considering performance, coding practice, browser support, and likeliness to fail?

On escaped characters: My idea was to tokenize the stack with forward-looking Regex groups. Is that possible, and are there better ways?

share|improve this question
    
Measure the performance in js profiler, otherwise we can only guess. I'd say it's slow because you modify the stack inside next() instead of using an index. – wOxxOm Nov 1 at 22:36
    
@wOxxOm See edit. I hope this is what you meant. (It is in the first results of "js profiler", so I assume so). – LarsW Nov 2 at 12:51

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.