Hello, I have been working with cheerio.js on the following code and unstructured content for the past few hours.
So far I have not been successful and would really appreciate your help.
I am iterating over the source content to generate an array, but the output array is not in the proper format: it comes out flat instead of nested.
Here is the source HTML.
var cheerio = require('cheerio');
// Sample markup used as input: a div#header1 wrapping one p.headpara, a run of
// p.elem paragraphs that each begin with a <b> holding the list marker
// ("a. ", "1. ", "(a)." ...), and two p.notelem paragraphs that have no <b>.
// All paragraphs are direct children of the div, so the markup itself carries
// no nesting information.
var htmlContent2 = '<div id="header1"><p class="headpara">Header content</p> <p class="elem"><b>a. </b>Lorem dolor sit amet, consectetur:</p><p class="elem"><b>1. </b>Perferendis iure doloremque iusto facilis.</p><p class="elem"><b>2. </b>Asperiores impedit officiis cumque molestias at rerum !</p><p class="elem"><b>b. </b>More dummy text.</p> <p class="elem"><b>1. </b>Additional dummy text: </p> <p class="elem"><b>(a).</b>Asperiores impedit officiis.</p> <p class="elem"><b>(b).</b>Lolestiae asperiores ad repellat est obcaecati.</p> <p class="elem"><b>2. </b>Lorem ipsum dolor sit amet 1.</p> <p class="elem"><b>3.</b>Lorem ipsum dolor sit amet 2. </p> <p class="notelem">Dignissimos maiores facere consequuntur quod.</p><p class="notelem"> maiores facere consequuntur quod.</p> <p class="elem"><b>c. </b>Ea consectetur excepturi aperiam.</p></div>';
Here is my code:
// Parse the sample markup so it can be queried with jQuery-style selectors.
var $ = cheerio.load(htmlContent2);

// Marker patterns, tested against the text of each <b> inside a ".elem" paragraph:
//   regex1: lowercase letter, ".", whitespace   e.g. "a. "
//   regex2: digit followed by "."               e.g. "1. " or "3."
//   regex3: lowercase letter in parentheses     e.g. "(a)."
//   regex4: digit in parentheses                e.g. "(1)" -- declared but not consulted below
var regex1 = /[a-z]\.\s/,
  regex2 = /[0-9]\./,
  regex3 = /\([a-z]\)/,
  regex4 = /\([0-9]\)/;

// Patterns actually checked for every marker, in this order.
var markerPatterns = [regex1, regex2, regex3];

// Flat list of collected marker strings, in document order.
// Declared explicitly: assigning to an undeclared name creates an implicit
// global and throws a ReferenceError in strict mode / ES modules.
var allList = [];

// .each() is used because the callback runs purely for its side effect on
// allList; .filter() would build a new selection from the callback's return
// value, and this callback returns nothing.
$(".elem b").each(function () {
  // Read the marker text once instead of re-querying it for every test.
  var marker = $(this).text();

  // A marker is pushed once per pattern it matches, so a text matching
  // several patterns would appear several times.
  markerPatterns.forEach(function (pattern) {
    if (pattern.test(marker)) {
      allList.push(marker);
    }
  });
});

console.log(JSON.stringify(allList));
The current result is:
["a. ","1. ","2. ","b. ","1. ","(a).","(b).","2. ","3.","c. "]
The desired result should be:
["a. ",["1. ","2. "],"b. ",["1. ",["(a).","(b)."],"2. ","3."],"c. "]
The array nesting level varies based on the source content. I have searched the forum for similar approaches to no avail.
Thanks
Note: all of the <b> tags in the HTML are at the same level.