I'm reading a behemoth of an XML file, line by line. Using htmlparser (v1.7.3), I'm trying to get the values of the attributes that will eventually be written to database.
Here's the code:
var fs = require('fs');
var sys = require('util');
var htmlparser = require('htmlparser');
function parseFoo(fileName) {
var stream = fs.createReadStream(fileName);
var buffer = "";
stream.addListener('data', function(data) {
buffer += data.toString();
var parts = buffer.split('\n');
parts.forEach(function(part, i) {
record.buildRecord(part);
})
buffer = parts.pop(); // Add remaining text to the buffer.
})
}
var record = (function() {
var handler = new htmlparser.DefaultHandler(function (error, dom) {});
var parser = new htmlparser.Parser(handler);
return {
'buildRecord': function(xml) {
parser.parseComplete(xml);
// Only for testing.
console.error(sys.inspect(handler.dom[0]['children'][0], false, null));
}
};
})();
parseFoo('foo.xml');
'foo.xml' (UTF-8 encoded) is trivial, it consists of gazillion of lines like this:
<w a="1" b="2" c="3" d="4">x</w>
When the program reaches a certain line (1204), I get an error:
D:\Geci\foo.js:25
console.error(sys.inspect(handler.dom[0]['children'][0], false, null));
TypeError: Cannot read property '0' of undefined
at Object.buildRecord (D:\Geci\foo.js:26:59)
at D:\Geci\foo.js:14:14
at Array.forEach (native)
at [object Object].<anonymous> (D:\Geci\foo.js:13:11)
at [object Object].emit (events.js:67:17)
at [object Object]._emitData (fs.js:1149:10)
at afterRead (fs.js:1131:10)
at Object.wrapper [as oncomplete] (fs.js:254:17)
I'm running this with Windows version of node.js (0.6.5.1).
Is it a bug in htmlparser or is it just me being a node.js newbie?
EDIT: this bug was solved by adding 'end' listener to stream
where I take special care of the last line. Plus moving parts.pop()
before the loop.
<w a="1" b="2" c="3" d="4">x</w>
but fails when I add 1204th line.<w a="1" b="2" c="3" d="4">x</w>
. ;) It's different in reality, of course. However, it fails in both cases.