Given an html page with heading tags such h1
, h2
through h6
and p
tags construct a hierarchical structure. A super problem of this problem is explained in Heading Based Sectional Hierarchy Identification for HTML Documents.
I've written the following code, how can I improve it:
(use-modules (ice-9 match))
(define-syntax test-check
(syntax-rules ()
((_ title tested-expression expected-result)
(begin (format #t "** Checking ~a\n" title)
(let* ((expected expected-result)
(produced tested-expression))
(if (not (equal? expected produced))
(begin (format #t "*** Expected: ~s\n" expected)
(format #t "*** Computed: ~s\n" produced))))))))
(define doc '((h . 1) p (h . 2) p p (h . 2) p p (h . 1) p p))
;; (define out ((div
;; (h1 "title 1")
;; (p "paragraph 1#1")
;; (div
;; (h2 "title 1.1")
;; (p paragraph
(define pp pretty-print)
(define (append-paragraph tree)
(match tree
('() '(p))
(('p . rest) (cons 'p (append-paragraph rest)))
(((('h . level) . rest)) (list (cons (cons 'h level) (append-paragraph rest))))
((head . rest) (cons head (append-paragraph rest)))
(_ (display "fuu ") (pp tree) (error tree))))
(test-check "append-paragraph: null"
(append-paragraph '())
'(p))
(test-check "append-paragraph: single paragraph"
(append-paragraph '(p))
'(p p))
(test-check "append-paragraph: single heading"
(append-paragraph '(((h . 1))))
'(((h . 1) p)))
(test-check "append-paragraph: paragraph followed by a single heading"
(append-paragraph '(p ((h . 1))))
'(p ((h . 1) p)))
(test-check "append-paragraph: paragraph followed by a heading with a paragraph"
(append-paragraph '(p ((h . 1) p)))
'(p ((h . 1) p p)))
(test-check "append-paragraph: nested 0"
(append-paragraph '(p ((h . 1) p) ((h . 1) p ((h . 2) p p))))
'(p ((h . 1) p) ((h . 1) p ((h . 2) p p p))))
(test-check "append-paragraph: nested 1"
(append-paragraph '(p ((h . 1) p) ((h . 1) p ((h . 2) p p) ((h . 2) p)) ((h . 1))))
'(p ((h . 1) p) ((h . 1) p ((h . 2) p p) ((h . 2) p)) ((h . 1) p)))
(test-check "append-paragraph: nested 2"
(append-paragraph '(p ((h . 1) p ((h . 2) p) ((h . 2) p ((h . 3) p) ((h . 3) p)))))
'(p ((h . 1) p ((h . 2) p) ((h . 2) p ((h . 3) p) ((h . 3) p p)))))
(define (append-heading tree level)
(match tree
('() `(((h . ,level))))
(((('h . other) . rest))
(if (eq? other level)
`(((h . ,level) . ,rest) ((h . ,level)))
`(((h . ,other) . ,(append-heading rest level)))))
((head . rest) `(,head . ,(append-heading rest level)))))
(test-check "append-heading: null"
(append-heading '() 1)
'(((h . 1))))
(test-check "append-heading: append h1 to h1"
(append-heading '(((h . 1))) 1)
'(((h . 1)) ((h . 1))))
(test-check "append-heading: append h1 to h1>h2"
(append-heading '(((h . 1) ((h . 2) p))) 1)
'(((h . 1) ((h . 2) p)) ((h . 1))))
(test-check "append-heading: append h2 to h1>h2"
(append-heading '(((h . 1) p ((h . 2) p))) 2)
'(((h . 1) p ((h . 2) p) ((h . 2)))))
(test-check "append-heading: append h3 to h1>h2"
(append-heading '(((h . 1) p ((h . 2) p))) 3)
'(((h . 1) p ((h . 2) p ((h . 3))))))
(test-check "append-heading: append h3 to h1>h2"
(append-heading '(((h . 1) p ((h . 2) p))) 3)
'(((h . 1) p ((h . 2) p ((h . 3))))))
(define (parse doc)
(let loop ((doc doc)
(out '()))
(match doc
('() out)
(('p . rest) (loop rest (append-paragraph out)))
((('h . level) . rest) (loop rest (append-heading out level))))))
(test-check "parse: paragraph"
(parse '(p))
'(p))
(test-check "parse 0"
(parse '(p (h . 1) p (h . 2) p (h . 2) p (h . 3) p (h . 3) p (h . 1) p))
'(p ((h . 1) p ((h . 2) p) ((h . 2) p ((h . 3) p) ((h . 3) p))) ((h . 1) p)))
(test-check "parse 1"
(parse '(p (h . 1) p (h . 2) p (h . 2) p (h . 3) p (h . 3) p p))
'(p ((h . 1) p ((h . 2) p) ((h . 2) p ((h . 3) p) ((h . 3) p p)))))
I read I can use zipper or cursors. Can the implementation of cursors be simpler in my case?