Skip to content

Commit 94fba76

Browse files
committed
[SPARKNLP-1299] Add Hierarchical Element Identification to HTMLReader
1 parent b827818 commit 94fba76

File tree

3 files changed

+179
-19
lines changed

3 files changed

+179
-19
lines changed

src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ import org.jsoup.nodes.{Document, Element, Node, TextNode}
2828
import scala.collection.JavaConverters._
2929
import scala.collection.mutable
3030
import scala.collection.mutable.ArrayBuffer
31+
import java.security.MessageDigest
32+
import java.util.UUID
33+
3134

3235
/** Class to parse and read HTML files.
3336
*
@@ -164,8 +167,8 @@ class HTMLReader(
164167
private def startTraversalFromBody(document: Document): Array[HTMLElement] = {
165168
try {
166169
val body = document.body()
167-
val elements = extractElements(body)
168170
val docTitle = document.title().trim
171+
val elements = extractElements(body)
169172

170173
if (docTitle.nonEmpty && includeTitleTag) {
171174
val titleElem = HTMLElement(
@@ -205,6 +208,11 @@ class HTMLReader(
205208
val trackingNodes = mutable.Map[Node, NodeMetadata]()
206209
var pageNumber = 1
207210

211+
// Track parent-child hierarchy
212+
var currentParentId: Option[String] = None
213+
214+
// Builds the identifier stored under the "element_id" metadata key: a freshly
// generated random UUID in its canonical string form.
def newUUID(): String = {
  val generated: UUID = UUID.randomUUID()
  generated.toString
}
215+
208216
def isNodeHidden(node: Node): Boolean = {
209217
node match {
210218
case elem: Element =>
@@ -237,15 +245,13 @@ class HTMLReader(
237245
trackingNodes(elem).visited = true
238246
val text = elem.ownText().trim
239247
if (text.nonEmpty) textBuffer += text
240-
// Recursively collect text from all child nodes
241248
elem.childNodes().asScala.foreach(traverseAndCollect)
242249

243250
case _ => // Ignore other node types
244251
}
245252
}
246253
}
247254

248-
// Start traversal for each node in the list
249255
nodes.foreach(traverseAndCollect)
250256
textBuffer.mkString(" ").replaceAll("\\s+", " ").trim
251257
}
@@ -261,9 +267,7 @@ class HTMLReader(
261267
NodeMetadata(tagName = tagName, hidden = isNodeHidden(childNode), visited = false))
262268
}
263269

264-
if (trackingNodes(node).hidden) {
265-
return
266-
}
270+
if (trackingNodes(node).hidden) return
267271

268272
node match {
269273
case element: Element =>
@@ -279,48 +283,53 @@ class HTMLReader(
279283
val linkText = element.text().trim
280284
if (href.nonEmpty && linkText.nonEmpty && !visitedNode) {
281285
trackingNodes(element).visited = true
286+
pageMetadata("element_id") = newUUID()
287+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
282288
elements += HTMLElement(
283289
ElementType.LINK,
284290
content = s"[$linkText]($href)",
285291
metadata = pageMetadata)
286292
}
293+
287294
case "table" =>
288295
pageMetadata("sentence") = sentenceIndex.toString
289296
sentenceIndex += 1
290297
val tableContent = outputFormat match {
291-
case "plain-text" =>
292-
extractNestedTableContent(element).trim
298+
case "plain-text" => extractNestedTableContent(element).trim
293299
case "html-table" =>
294300
element
295301
.outerHtml()
296302
.replaceAll("\\n", "")
297303
.replaceAll(">\\s+<", "><")
298304
.replaceAll("^\\s+|\\s+$", "")
299-
case "json-table" =>
300-
tableElementToJson(element)
301-
case _ =>
302-
extractNestedTableContent(element).trim
305+
case "json-table" => tableElementToJson(element)
306+
case _ => extractNestedTableContent(element).trim
303307
}
304308
if (tableContent.nonEmpty && !visitedNode) {
305309
trackingNodes(element).visited = true
310+
pageMetadata("element_id") = newUUID()
311+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
306312
elements += HTMLElement(
307313
ElementType.TABLE,
308314
content = tableContent,
309315
metadata = pageMetadata)
310316
}
317+
311318
case "li" =>
312319
pageMetadata("sentence") = sentenceIndex.toString
313320
sentenceIndex += 1
314321
val itemText = element.text().trim
315322
if (itemText.nonEmpty && !visitedNode) {
316323
trackingNodes(element).visited = true
324+
pageMetadata("element_id") = newUUID()
325+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
317326
elements += HTMLElement(
318327
ElementType.LIST_ITEM,
319328
content = itemText,
320329
metadata = pageMetadata)
321330
}
331+
322332
case "pre" =>
323-
// A <pre> tag typically contains a <code> child
324333
val codeElem = element.getElementsByTag("code").first()
325334
val codeText =
326335
if (codeElem != null) codeElem.text().trim
@@ -329,22 +338,22 @@ class HTMLReader(
329338
pageMetadata("sentence") = sentenceIndex.toString
330339
sentenceIndex += 1
331340
trackingNodes(element).visited = true
341+
pageMetadata("element_id") = newUUID()
342+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
332343
elements += HTMLElement(
333344
ElementType.UNCATEGORIZED_TEXT,
334345
content = codeText,
335346
metadata = pageMetadata)
336347
}
348+
337349
case tag if isParagraphLikeElement(element) =>
338350
if (!visitedNode) {
339351
val classType = classifyParagraphElement(element)
340-
341-
// Traverse children first so that <img>, <a>, etc. inside the paragraph are processed
342352
element.childNodes().asScala.foreach { childNode =>
343353
val tagName = getTagName(childNode)
344354
traverse(childNode, tagName)
345355
}
346356

347-
// Now handle the paragraph itself
348357
classType match {
349358
case ElementType.NARRATIVE_TEXT =>
350359
val childNodes = element.childNodes().asScala.toList
@@ -353,6 +362,8 @@ class HTMLReader(
353362
pageMetadata("sentence") = sentenceIndex.toString
354363
sentenceIndex += 1
355364
trackingNodes(element).visited = true
365+
pageMetadata("element_id") = newUUID()
366+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
356367
elements += HTMLElement(
357368
ElementType.NARRATIVE_TEXT,
358369
content = aggregatedText,
@@ -365,10 +376,13 @@ class HTMLReader(
365376
pageMetadata("sentence") = sentenceIndex.toString
366377
sentenceIndex += 1
367378
trackingNodes(element).visited = true
379+
val titleId = newUUID()
380+
pageMetadata("element_id") = titleId
368381
elements += HTMLElement(
369382
ElementType.TITLE,
370383
content = titleText,
371384
metadata = pageMetadata)
385+
currentParentId = Some(titleId)
372386
}
373387

374388
case ElementType.UNCATEGORIZED_TEXT =>
@@ -377,28 +391,36 @@ class HTMLReader(
377391
pageMetadata("sentence") = sentenceIndex.toString
378392
sentenceIndex += 1
379393
trackingNodes(element).visited = true
394+
pageMetadata("element_id") = newUUID()
395+
currentParentId.foreach(pid => pageMetadata("parent_id") = pid)
380396
elements += HTMLElement(
381397
ElementType.UNCATEGORIZED_TEXT,
382398
content = text,
383399
metadata = pageMetadata)
384400
}
385401
}
386402
}
403+
387404
case _ if isTitleElement(element) && !visitedNode =>
388405
trackingNodes(element).visited = true
389406
val titleText = element.text().trim
390407
if (titleText.nonEmpty) {
391408
pageMetadata("sentence") = sentenceIndex.toString
392409
sentenceIndex += 1
410+
val titleId = newUUID()
411+
pageMetadata("element_id") = titleId
393412
elements += HTMLElement(
394413
ElementType.TITLE,
395414
content = titleText,
396415
metadata = pageMetadata)
416+
currentParentId = Some(titleId)
397417
}
418+
398419
case "hr" =>
399420
if (element.attr("style").toLowerCase.contains("page-break")) {
400421
pageNumber = pageNumber + 1
401422
}
423+
402424
case "img" =>
403425
pageMetadata("sentence") = sentenceIndex.toString
404426
sentenceIndex += 1
@@ -411,7 +433,6 @@ class HTMLReader(
411433
val height = element.attr("height").trim
412434

413435
val imgMetadata = mutable.Map[String, String]("alt" -> alt) ++ pageMetadata
414-
415436
var contentValue = src
416437
if (isBase64) {
417438
val commaIndex = src.indexOf(',')
@@ -422,14 +443,16 @@ class HTMLReader(
422443
contentValue = base64Payload
423444
}
424445
}
425-
426446
if (width.nonEmpty) imgMetadata("width") = width
427447
if (height.nonEmpty) imgMetadata("height") = height
448+
imgMetadata("element_id") = newUUID()
449+
currentParentId.foreach(pid => imgMetadata("parent_id") = pid)
428450
elements += HTMLElement(
429451
ElementType.IMAGE,
430452
content = contentValue,
431453
metadata = imgMetadata)
432454
}
455+
433456
case _ =>
434457
element.childNodes().asScala.foreach { childNode =>
435458
val tagName = getTagName(childNode)
@@ -440,7 +463,6 @@ class HTMLReader(
440463
}
441464
}
442465

443-
// Start traversal from the root node
444466
val tagName = getTagName(root)
445467
traverse(root, tagName)
446468
elements.toArray
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
<!-- File: /index.html -->
2+
<!doctype html>
3+
<html lang="en">
4+
<head>
5+
<meta charset="utf-8" />
6+
<title>Simple Book: 3 Chapters</title>
7+
<meta name="viewport" content="width=device-width, initial-scale=1" />
8+
<style>
9+
/* Keep navigation readable on small screens */
10+
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; line-height: 1.6; margin: 2rem; }
11+
nav ul { list-style: none; padding: 0; }
12+
nav li { margin: 0.25rem 0; }
13+
a { text-decoration: none; }
14+
a:hover { text-decoration: underline; }
15+
hr { margin: 2rem 0; }
16+
.back { display: inline-block; margin-top: 0.5rem; }
17+
</style>
18+
</head>
19+
<body>
20+
<h1 id="top">Simple Book</h1>
21+
22+
<nav aria-label="Chapter index">
23+
<h2>Index</h2>
24+
<ul>
25+
<li><a href="#chapter-1">Chapter 1: Beginnings</a></li>
26+
<li><a href="#chapter-2">Chapter 2: Middle Path</a></li>
27+
<li><a href="#chapter-3">Chapter 3: Finishing Touch</a></li>
28+
</ul>
29+
</nav>
30+
31+
<hr />
32+
33+
<section id="chapter-1">
34+
<h2>Chapter 1: Beginnings</h2>
35+
<p>
36+
Every project starts with a simple idea and a clear intention. In this chapter, we set the stage and outline the basic goals.
37+
Small steps help build momentum and reduce uncertainty. With a plan in place, moving forward becomes much easier.
38+
</p>
39+
<a class="back" href="#top">Back to top</a>
40+
</section>
41+
42+
<hr />
43+
44+
<section id="chapter-2">
45+
<h2>Chapter 2: Middle Path</h2>
46+
<p>
47+
Progress is rarely a straight line, and that is perfectly fine. Here we adjust our approach based on what we learn.
48+
Iteration helps refine ideas and improves the final outcome. Staying flexible keeps the project healthy and on track.
49+
</p>
50+
<a class="back" href="#top">Back to top</a>
51+
</section>
52+
53+
<hr />
54+
55+
<section id="chapter-3">
56+
<h2>Chapter 3: Finishing Touch</h2>
57+
<p>
58+
The final phase focuses on clarity and polish. We review the work, remove distractions, and keep what matters.
59+
A simple, tidy result is easier to use and maintain. With that, the project is ready to share.
60+
</p>
61+
<a class="back" href="#top">Back to top</a>
62+
</section>
63+
</body>
64+
</html>

src/test/scala/com/johnsnowlabs/reader/HTMLReaderTest.scala

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ class HTMLReaderTest extends AnyFlatSpec {
125125
it should "correctly parse caption and th tags" taggedAs FastTest in {
126126
val HTMLReader = new HTMLReader()
127127
val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/example-caption-th.html")
128+
htmlDF.show(truncate = false)
128129
val titleDF = htmlDF
129130
.select(explode(col("html")).as("exploded_html"))
130131
.filter(col("exploded_html.elementType") === ElementType.TABLE)
@@ -185,4 +186,77 @@ class HTMLReaderTest extends AnyFlatSpec {
185186
assert(imagesDF.count() == 1)
186187
}
187188

189+
it should "include parent and element ids" taggedAs FastTest in {
  val HTMLReader = new HTMLReader()
  val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/simple-book.html")

  // One row per extracted HTML element; metadata values are looked up by key below.
  val parentChildDF = htmlDF
    .select(explode(col("html")).as("exploded_html"))
  val metadataCol = col("exploded_html.metadata")

  // The reader writes "element_id" for the elements it emits while traversing the body,
  // so a non-trivial document must yield at least one row carrying it.
  val elementsWithId = parentChildDF
    .filter(metadataCol("element_id").isNotNull)
    .count()
  assert(elementsWithId > 0, "No element carries an element_id in its metadata")

  // Elements emitted after a title element receive that title's id as "parent_id".
  val elementsWithParent = parentChildDF
    .filter(metadataCol("parent_id").isNotNull)
    .count()
  assert(elementsWithParent > 0, "No element carries a parent_id in its metadata")

  // "parent_id" is only ever written right after "element_id", so a row holding a
  // parent_id without an element_id would indicate a broken hierarchy.
  val orphanedParentRefs = parentChildDF
    .filter(metadataCol("parent_id").isNotNull && metadataCol("element_id").isNull)
    .count()
  assert(
    orphanedParentRefs == 0,
    s"$orphanedParentRefs element(s) have a parent_id but no element_id")
}
200+
201+
it should "produce valid element_id and parent_id relationships" taggedAs FastTest in {
  val HTMLReader = new HTMLReader()
  val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/simple-book.html")

  // element_id values are random UUIDs, so each Spark action that re-evaluates the
  // reader may see different ids. Collect exactly once and run every check against
  // that single snapshot instead of relying on cache() to keep actions consistent.
  val idPairs: Array[(Option[String], Option[String])] = htmlDF
    .select(explode(col("html")).as("elem"))
    .select(
      col("elem.metadata")("element_id").as("element_id"),
      col("elem.metadata")("parent_id").as("parent_id"))
    .collect()
    .map(row => (Option(row.getString(0)), Option(row.getString(1))))

  val elementIds: Array[String] = idPairs.collect { case (Some(id), _) => id }
  val allElementIds: Set[String] = elementIds.toSet
  val allParentIds: Set[String] = idPairs.collect { case (_, Some(pid)) => pid }.toSet

  // 1. There should be at least one element with an element_id
  assert(allElementIds.nonEmpty, "No elements have element_id metadata")

  // 2. There should be at least one element with a parent_id
  assert(allParentIds.nonEmpty, "No elements have parent_id metadata")

  // 3. Every parent_id should exist as an element_id
  val missingParents = allParentIds.diff(allElementIds)
  assert(
    missingParents.isEmpty,
    s"Some parent_ids do not correspond to existing element_ids: $missingParents")

  // 4. element_ids must identify exactly one element each
  assert(
    allElementIds.size == elementIds.length,
    s"Duplicate element_ids found: ${elementIds.diff(allElementIds.toSeq).distinct.mkString(", ")}")

  // 5. No element may reference itself as its parent
  val selfParented = idPairs.collect { case (Some(id), Some(pid)) if id == pid => id }
  assert(
    selfParented.isEmpty,
    s"Some elements are their own parent: ${selfParented.mkString(", ")}")
}
259+
260+
261+
188262
}

0 commit comments

Comments
 (0)