I have a project for which I need to parse BibTeX strings. In v0.2.9
and before, I used Regex, and I want to change that. I set up a while loop system that reads the file char by char, but it seems slow.
I wonder if there is a better way to use while loops, or to parse in general.
Note: I'll worry about escaped characters later, I was just wondering if this method is feasible. Tips for dealing with escaped characters are always welcome, but note that I haven't forgotten them.
function flipDel(str) {
return str
.replace(/{/g, '}')
.split('')
.reverse()
.join('')
}
function getDel(stack, end_del) {
return stack
.slice(0, end_del.length ? end_del.length - 1 : 0)
.reverse()
.join('')
}
var str = '...', // Input string
arr = []
;(function() {
var stack = str.split(''),
curs = '',
nexr, obj
function next(bool) {
curs = stack.shift()
nexr = stack.slice(0, 1)[0]
}
next()
while (curs) { // Loop through entries
// Add entry to JSON
arr.push({
type: '',
label: '',
props: {}
})
obj = arr[arr.length - 1]
// BEGIN TOP ROW (e.g. "@book{authorYEARtitle,")
while (curs !== '@')
next()
next()
while (curs !== '{') {
obj.type += curs
next(true)
}
next()
while (curs !== ',') {
obj.label += curs
next(true)
}
next()
// END TOP ROW
// Whitespace
while (/^\s$/.test(curs))
next()
// BEGIN VALUE
var key, start_del, end_del, nexs, val, bool_next = true
// While there are values (check if there are at the end of this loop)
while (bool_next) {
bool_next = false,
key = ''
// Collect all non-whitespace chars in "key"
while (curs !== '=') {
if (!/^\s$/.test(curs)) {
key += curs
next(true)
} else next()
}
next()
while (/^\s$/.test(curs))
next()
start_del = ''
// See what the value delimiter is
while (/^["{]$/.test(curs)) {
start_del += curs
next()
}
val = ''
end_del = flipDel(start_del),
nexs = getDel(stack, end_del)
// Collect all characters between the delimiter and the
// flipped delimiter as "val" (value)
//
// Had to perform a trick to for multiple-char delimiters and
// no delimiters
while (!start_del.length && !end_del.length ?
(curs !== ',') :
(curs + nexs !== end_del)
) {
val += curs
next(true)
nexs = getDel(stack, end_del)
}
// End delimiter, comma and whitespace
while (curs !== ',' && !/^\s$/.test(curs))
next()
while (/^\s$/.test(curs))
next()
while (curs === ',')
next()
while (/^\s$/.test(curs))
next()
// If the current char isn't the end of the entry, loop again
if (curs !== '}')
bool_next = true
// Add key-value pair to collection
obj.props[key] = val
}
// END VALUE
next()
}
return;
})();
Here's a working example. There are two almost identical entries to add test cases without having to look up more examples.
$(function() {
function flipDel(str) {
return str
.replace(/{/g, '}')
.split('')
.reverse()
.join('')
}
function getDel(stack, end_del) {
return stack
.slice(0, end_del.length ? end_del.length - 1 : 0)
.reverse()
.join('')
}
var $ins = $('#ins'),
$out = $('#out'),
str = $ins.find('span').text(),
arr = []
;
(function() {
var stack = str.split(''),
curs = '',
nexr, obj
function next(bool) {
curs = stack.shift()
nexr = stack.slice(0, 1)[0]
}
next()
while (curs) {
arr.push({
type: '',
label: '',
props: {}
})
obj = arr[arr.length - 1]
// BEGIN TOP ROW
while (curs !== '@')
next()
next()
while (curs !== '{') {
obj.type += curs
next(true)
}
next()
while (curs !== ',') {
obj.label += curs
next(true)
}
next()
// END TOP ROW
while (/^\s$/.test(curs))
next()
// BEGIN VALUE
var key, start_del, end_del, nexs, val, bool_next = true
while (bool_next) {
bool_next = false,
key = ''
while (curs !== '=') {
if (!/^\s$/.test(curs)) {
key += curs
next(true)
} else next()
}
next()
while (/^\s$/.test(curs))
next()
start_del = ''
while (/^["{]$/.test(curs)) {
start_del += curs
next()
}
val = ''
end_del = flipDel(start_del),
nexs = getDel(stack, end_del)
while (!start_del.length && !end_del.length ?
(curs !== ',') :
(curs + nexs !== end_del)
) {
val += curs
next(true)
nexs = getDel(stack, end_del)
}
while (curs !== ',' && !/^\s$/.test(curs))
next()
while (/^\s$/.test(curs))
next()
while (curs === ',')
next()
while (/^\s$/.test(curs))
next()
if (curs !== '}')
bool_next = true
obj.props[key] = val
}
// END VALUE
next()
}
$out.html(JSON.stringify(arr, null, 2))
return;
})();
})
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<pre id="ins"><span>@article{Steinbeck2003,
author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
year = {2003},
title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
journal = {Journal of chemical information and computer sciences},
volume = {43},
number = {2},
pages = {493--500},
doi = {10.1021/ci025584y},
isbn = {2214707786},
issn = {0095-2338},
pmid = {12653513},
url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513},
}
@article{Steinbeck2003,
author = {Steinbeck, Christoph and Han, Yongquan and Kuhn, Stefan and Horlacher, Oliver and Luttmann, Edgar and Willighagen, Egon},
year = 2003,
title = {{The Chemistry Development Kit (CDK): an open-source Java library for Chemo- and Bioinformatics.}},
journal = {Journal of chemical information and computer sciences},
volume = "43",
number = {2},
pages = {493--500},
doi = "{10.1021/ci025584y}",
isbn = {2214707786},
issn = {0095-2338},
pmid = {12653513},
url = {http://www.ncbi.nlm.nih.gov/pubmed/12653513}
}</span></pre>
<hr>
<pre id="out"></pre>
Edit
I couldn't find the language settings, so a small explanation: "aanroepen" is calls, "(eigen) tijd" is (own) time, "gem." is average and "bestand" is file.
As wOxxOm predicted, it is mainly next()
that is the problem. Suppose I replace all places next()
is called with curs = stack[ ++index ]
(and I declare var index = 0
somewhere at the top).
Are there better ways of parsing BibTeX, considering performance, coding practice, browser support, and likeliness to fail?
On escaped characters: My idea was to tokenize the stack with forward-looking Regex groups. Is that possible, and are there better ways?