Join the Stack Overflow Community
Stack Overflow is a community of 6.6 million programmers, just like you, helping each other.
Join them; it only takes a minute:
Sign up

Hello I have been working on the following code and unstructured content with cheerio.js for the past few hours.

So far I have not been successful and would really appreciate your help.

I am iterating over the source content to generate an array, but the output array is not in the proper format.

Here is the source HTML.

// cheerio is used by the snippets below to parse and query the HTML fixture.
var cheerio = require('cheerio');
// Flat sample markup: every list label is a <b> inside a <p class="elem">,
// and all of those paragraphs are siblings under the same <div>. The
// <p class="headpara"> and <p class="notelem"> paragraphs carry no label.
var htmlContent2 = '<div id="header1"><p class="headpara">Header content</p> <p class="elem"><b>a. </b>Lorem  dolor sit amet, consectetur:</p><p class="elem"><b>1. </b>Perferendis iure doloremque iusto  facilis.</p><p class="elem"><b>2. </b>Asperiores impedit officiis cumque molestias at rerum !</p><p class="elem"><b>b. </b>More dummy text.</p> <p class="elem"><b>1. </b>Additional dummy text: </p> <p class="elem"><b>(a).</b>Asperiores impedit officiis.</p> <p class="elem"><b>(b).</b>Lolestiae asperiores ad repellat est obcaecati.</p> <p class="elem"><b>2. </b>Lorem ipsum dolor sit amet 1.</p> <p class="elem"><b>3.</b>Lorem ipsum dolor sit amet 2. </p> <p class="notelem">Dignissimos maiores facere consequuntur quod.</p><p class="notelem"> maiores facere consequuntur quod.</p>  <p class="elem"><b>c. </b>Ea consectetur excepturi aperiam.</p></div>';

Here is my code:

var $ = cheerio.load(htmlContent2);
//Regex to filter content based on pattern:

var regex1 = /[a-z]\.\s/,      // "a. "  : lowercase letter, dot, whitespace
    regex2 = /[0-9]\./,        // "1."   : digit, dot
    regex3 = /\([a-z]\)/,      // "(a)"  : lowercase letter in parentheses
    regex4 = /\([0-9]\)/;      // "(1)"  : digit in parentheses (not tested below)

// Declared explicitly: a bare `allList = []` creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
var allList = [];

// `.filter` is used here only to visit every matched <b>; the callback
// returns nothing and `newElements` is not read again in this snippet.
var newElements = $(".elem b").filter(function () {
    var label = $(this).text();

    // The three checks are independent (not else-if): a label matching more
    // than one pattern is pushed once per match, exactly as before.
    [regex1, regex2, regex3].forEach(function (pattern) {
        if (pattern.test(label)) {
            allList.push(label);
        }
    });
});
console.log(JSON.stringify(allList));

The current result is:

["a. ","1. ","2. ","b. ","1. ","(a).","(b).","2. ","3.","c. "]

The desired result should be:

["a. ",["1. ","2. "],"b. ",["1. ",["(a).","(b)."],"2. ","3."],"c. "]

The array nesting level varies based on the source content. I have searched the forum for similar approaches to no avail.

Thanks

share|improve this question
    
What are the criteria for an element's level of nesting in the array? It seems that all <b> tags in the HTML are at the same level – hackerrdave Jan 12 at 4:46
    
The <b> tags are at the same level in the source content, which is flat (manually formatted), but I need to create new structured content by nesting the result content properly. So for example I need a. b. c. (first level), 1.2.3 (second level), (a), (b).. (third level) etc. They may occur anywhere in the content. I am able to get the content based on the pattern right but can't figure out how to nest the second and third levels arrays as a whole under their respective parent. – Manou Jan 12 at 4:59
up vote 1 down vote accepted

You were getting close. As long as you only need to test for the 4 conditions you set up as regex, this should work fine. Basically, it's just checking what level you are on and either pushing or popping an array off a stack to build the structure you want.

EDIT: I cleaned up code so it wouldn't pollute your scope as much. I also changed your regex a little, some were necessary some were just trying to anticipate data changes (i.e. no space after value or lists going higher than 9).

EDIT2: I also noticed that you were trying to do something with the content so I gave you a way to nest by a property of an object that way you can have useful information along with the nested "labels".

EDIT3: Added a print function to help you test the issue you were having and fixed a bug where jumping from a higher level to more than one level below would incorrectly just decrease one level. You'll see we break apart the content so that it can be nested by labels, then recombine them as you wish during output. Here it's just logging but of course could just as easily be appending html tags to a document.

/**
 * Nest a flat, ordered list of outline entries into arrays by label level.
 *
 * Levels are decided by the label at the START of each entry's text:
 *   "a."  -> 0,  "1."  -> 1,  "(a)." -> 2,  "(1)." -> 3
 * Level-0 entries are pushed directly into the returned array; each deeper
 * run of entries is collected into a nested array placed right after its
 * parent entry.
 *
 * @param {Array} list  Entries in document order: strings, or objects when
 *                      `prop` is given.
 * @param {string} [prop]  Name of the property holding the text to classify.
 *                         When omitted, each entry itself is classified.
 * @returns {Array} New nested array holding the original entries.
 * @throws {Error} 'Unexpected content' when an entry matches no level pattern.
 */
var autoNest = function(list, prop) {

    // Anchored with ^ so that only the leading label decides the level.
    // Unanchored, full-text entries such as "1. ... facilis." were classified
    // as level 0 because "s." matched the letter pattern. Leading whitespace
    // is tolerated so padded labels keep working.
    var levelPatterns = [
        /^\s*[a-z]\.\s?/,
        /^\s*[0-9]+\.\s?/,
        /^\s*\([a-z]\)\.\s?/,
        /^\s*\([0-9]+\)\.\s?/
    ];

    // Index of the first matching pattern is the nesting level.
    var getLevel = function(text) {
        for (var i = 0; i < levelPatterns.length; i++) {
            if (levelPatterns[i].test(text)) {
                return i;
            }
        }
        throw new Error('Unexpected content');
    };

    var peek = function(arr) {
        return arr[arr.length - 1];
    };

    var result = [];
    // Each frame remembers the level of the array it collects, so closing
    // arrays never depends on how many levels were skipped on the way down.
    var stack = [{ level: 0, items: result }];

    list.forEach(function(obj) {
        var value = prop ? obj[prop] : obj;
        var currentLevel = getLevel(value);

        // Close every open array that is deeper than the current entry; the
        // root frame is never popped.
        while (stack.length > 1 && peek(stack).level > currentLevel) {
            stack.pop();
        }

        // Going deeper opens exactly one nested array, even when one or more
        // levels are skipped.
        if (peek(stack).level < currentLevel) {
            var next = [];
            peek(stack).items.push(next);
            stack.push({ level: currentLevel, items: next });
        }

        peek(stack).items.push(obj);
    });
    return result;
};

// Collected { label, content } records, one per <b> label, in document
// order. Declared here because nothing above declares it; pushing to an
// undeclared `items` throws a ReferenceError.
var items = [];

$('.elem b').each(function() {
    // `label` is the text of the <b> itself; `content` is the text of the
    // enclosing paragraph (presumably including the label text - verify
    // against the output if the label must not appear twice).
    var label = $(this).text();
    var content = $(this).parent().text();
    items.push({
        label: label,
        content: content
    });
});

/**
 * Log a nested list of { label, content } records, one line per record,
 * indented by two spaces for every level of array nesting.
 *
 * @param {Array} items  Records and/or nested arrays of records.
 * @param {number} [level]  Current nesting depth; treated as 0 when falsy.
 */
function printArray(items, level) {
    var depth = level || 0;

    // The indent depends only on the depth, so build it once per call.
    var indent = '';
    for (var remaining = depth; remaining > 0; remaining--) {
        indent += '  ';
    }

    items.forEach(function(entry) {
        if (Array.isArray(entry)) {
            printArray(entry, depth + 1);
            return;
        }
        console.log(indent + (entry.label + ' ' + entry.content));
    });
}

// Nest the collected records by their `label` property, then log the result
// with one line per record, indented by nesting depth.
var nested = autoNest(items, 'label');
printArray(nested);
share|improve this answer
    
Thanks so much for taking the time to document and explain what the code does. Will it be possible to generate only the nested array without the labels, and also to cater for nesting levels that are deeper or shallower? – Manou Jan 12 at 21:02
    
Yes. Just pass in an array of strings and call autoNest without second argument (it's optional). To add deeper nesting, just add a regex for each level and modify getLevel to take those into account. – DeezCashews Jan 13 at 2:37
    
after adjusting the code, the results looks like: [ 'a. Lorem dolor sit amet, consectetur:', '1. Perferendis iure doloremque iusto facilis.', [ '2. Asperiores impedit officiis cumque molestias at rerum !' ], 'b. More dummy text.', [ '1. Additional dummy text: ' ], '(a).Asperiores impedit officiis.', '(b).Lolestiae asperiores ad repellat est obcaecati.', [ '2. Lorem ipsum dolor sit amet 1.', '3.Lorem ipsum dolor sit amet 2. ' ], 'c. Ea consectetur excepturi aperiam.' ] – Manou Jan 13 at 6:29
    
But the expected result should be: [ 'a. Lorem dolor sit amet, consectetur:', ['1. Perferendis iure doloremque iusto facilis.', '2. Asperiores impedit officiis cumque molestias at rerum !' ], 'b. More dummy text.', [ '1. Additional dummy text: ' , ['(a).Asperiores impedit officiis.', '(b).Lolestiae asperiores ad repellat est obcaecati.'], '2. Lorem ipsum dolor sit amet 1.', '3.Lorem ipsum dolor sit amet 2. ' ], 'c. Ea consectetur excepturi aperiam.' ]. Any way to achieve this will really help. Every level should be an array. – Manou Jan 13 at 6:36
    
The regex is tailored for just the "1." or "a." portion. Either use it the way I showed, with label and content, which you can then recombine later when building your output, or adjust the regex to only test the beginning of the string. I recommend the former as it's already been tested. – DeezCashews Jan 13 at 7:34

Hi guys, here is my solution, which directly transforms an a-style array into a b-style array.

// Flat input: labels in document order.
var a = [
    "a. ",
    "1. ", "2. ",
    "b. ",
    "1. ",
    "(a).", "(b).",
    "2. ", "3.",
    "c. "
];
// Expected nesting of `a`.
var b = [
    "a. ",
    ["1. ", "2. "],
    "b. ",
    ["1. ", ["(a).", "(b)."], "2. ", "3."],
    "c. "
];
/**
 * Classify a label by outline level.
 *
 * @param {*} o  Label to classify (tested as a string).
 * @returns {number} 1 for "a. ", 2 for "1.", 3 for "(a)", 4 for "(1)";
 *                   0 when no pattern matches. The first match wins.
 */
var level = function(o) {
    // Ordered from shallowest to deepest; position + 1 is the level.
    var patterns = [
        /[a-z]\.\s/,
        /[0-9]\./,
        /\([a-z]\)/,
        /\([0-9]\)/
    ];
    for (var idx = 0; idx < patterns.length; idx++) {
        if (patterns[idx].test(o)) {
            return idx + 1;
        }
    }
    return 0;
};
/**
 * Build a parallel array holding the level of every entry of `arr`.
 *
 * @param {Array} arr  Entries to classify with `level`.
 * @returns {Array<number>} Levels, index-aligned with `arr`.
 */
var shadow = function(arr) {
    var levels = [];
    var total = arr.length;
    var pos = 0;
    while (pos < total) {
        levels[pos] = level(arr[pos]);
        pos += 1;
    }
    return levels;
};
/**
 * Nest a flat array of labels in place, using the levels from `shadow`.
 *
 * Entries with a deeper level than the entry before them are collected into
 * a nested array placed right after that entry. A run of deeper entries is
 * closed by the next shallower-or-equal entry or by the end of the input, so
 * trailing children and children that follow consecutive siblings are nested
 * too (the previous stride-of-two scan left both cases flat).
 *
 * @param {Array} arr  Flat labels in document order; rewritten in place.
 * @returns {Array} The same `arr` instance, now nested.
 */
var convert = function(arr) {
    var levels = shadow(arr);
    var root = [];
    // The first entry defines the top level of the result.
    var stack = [{ level: levels.length > 0 ? levels[0] : 0, items: root }];
    var i, current, top, child;

    for (i = 0; i < arr.length; i++) {
        current = levels[i];

        // Close every open group deeper than this entry; never pop the root.
        while (stack.length > 1 && stack[stack.length - 1].level > current) {
            stack.pop();
        }
        top = stack[stack.length - 1];

        if (top.level > current) {
            // Only reachable at the root: an entry shallower than anything
            // seen so far becomes the new top level.
            top.level = current;
        } else if (top.level < current) {
            child = [];
            top.items.push(child);
            top = { level: current, items: child };
            stack.push(top);
        }
        top.items.push(arr[i]);
    }

    // Write the nested structure back into the caller's array.
    arr.length = 0;
    for (i = 0; i < root.length; i++) {
        arr.push(root[i]);
    }
    return arr;
};

Test:

// Convert the flat fixture and compare it with the expected nesting.
// NOTE(review): `ut` and `joCompare` are not defined in this snippet;
// presumably they come from the author's own test utility.
let x = convert(a);
let txt = JSON.stringify(x);
ut.writeLog(txt);
ut.assertEqual(x, b, joCompare);

Test Start - (1) --
["a. ",["1. ","2. "],"b. ",["1. ",["(a).","(b)."],"2. ","3."],"c. "]
1. [PASS]: (...) ==> (Equal) - (Equal) expected

-Test End - (1) --

Total: 1, Passed: 1, Failed: 0, Stat: 100% Passed

share|improve this answer

Did just a few edits in your code. All I do differently is use different arrays for different levels:

var $ = cheerio.load(htmlContent2);
//Regex to filter content based on pattern:

var regex1 = /[a-z]\.\s/,      // first level:  "a. "
    regex2 = /[0-9]\./,        // second level: "1."
    regex3 = /\([a-z]\)/,      // third level:  "(a)"
    regex4 = /\([0-9]\)/;      // fourth level: "(1)" (not handled below)

// Declared explicitly: a bare `allList = []` creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
var allList = [];

// Pending second- and third-level groups. They live outside the callback so
// they accumulate across elements until a shallower label closes them.
var level2 = [];
var level3 = [];

// Attach the pending third-level group (if any) to the open second-level group.
function flushLevel3() {
    if (level3.length > 0) {
        level2.push(level3);
        level3 = [];
    }
}

// Close the open second-level group (if any) and append it to the result.
// The third-level group is attached first so it ends up inside its parent
// group instead of being emitted as a separate array afterwards.
function flushLevel2() {
    flushLevel3();
    if (level2.length > 0) {
        allList.push(level2);
        level2 = [];
    }
}

// `.filter` is used only to visit every matched <b>; the callback returns
// nothing and `newElements` is not read again in this snippet.
var newElements = $(".elem b").filter(function () {
    var label = $(this).text();

    if (regex1.test(label)) {
        // A first-level label closes everything that is still open.
        flushLevel2();
        allList.push(label);
        return;
    }

    if (regex2.test(label)) {
        // A second-level label closes the open third-level group.
        flushLevel3();
        level2.push(label);
        return;
    }

    if (regex3.test(label)) {
        level3.push(label);
    }
});

// Emit groups still open when the input ends on a nested label.
flushLevel2();

console.log(JSON.stringify(allList));
share|improve this answer
    
After testing this code, I only get the following result: ["a. ","b. ","c. "] – Manou Jan 12 at 21:10

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.