@@ -28,6 +28,9 @@ import org.jsoup.nodes.{Document, Element, Node, TextNode}
2828import scala .collection .JavaConverters ._
2929import scala .collection .mutable
3030import scala .collection .mutable .ArrayBuffer
31+ import java .security .MessageDigest
32+ import java .util .UUID
33+
3134
3235/** Class to parse and read HTML files.
3336 *
@@ -164,8 +167,8 @@ class HTMLReader(
164167 private def startTraversalFromBody (document : Document ): Array [HTMLElement ] = {
165168 try {
166169 val body = document.body()
167- val elements = extractElements(body)
168170 val docTitle = document.title().trim
171+ val elements = extractElements(body)
169172
170173 if (docTitle.nonEmpty && includeTitleTag) {
171174 val titleElem = HTMLElement (
@@ -205,6 +208,11 @@ class HTMLReader(
205208 val trackingNodes = mutable.Map [Node , NodeMetadata ]()
206209 var pageNumber = 1
207210
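211+ // Track parent-child hierarchy: holds the element_id of the most recently emitted TITLE element, which subsequently emitted non-title elements record as their parent_id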
212+ var currentParentId : Option [String ] = None
213+
214+ def newUUID (): String = UUID .randomUUID().toString // fresh random UUID as a string; used for each emitted element's "element_id" metadata
215+
208216 def isNodeHidden (node : Node ): Boolean = {
209217 node match {
210218 case elem : Element =>
@@ -237,15 +245,13 @@ class HTMLReader(
237245 trackingNodes(elem).visited = true
238246 val text = elem.ownText().trim
239247 if (text.nonEmpty) textBuffer += text
240- // Recursively collect text from all child nodes
241248 elem.childNodes().asScala.foreach(traverseAndCollect)
242249
243250 case _ => // Ignore other node types
244251 }
245252 }
246253 }
247254
248- // Start traversal for each node in the list
249255 nodes.foreach(traverseAndCollect)
250256 textBuffer.mkString(" " ).replaceAll(" \\ s+" , " " ).trim
251257 }
@@ -261,9 +267,7 @@ class HTMLReader(
261267 NodeMetadata (tagName = tagName, hidden = isNodeHidden(childNode), visited = false ))
262268 }
263269
264- if (trackingNodes(node).hidden) {
265- return
266- }
270+ if (trackingNodes(node).hidden) return
267271
268272 node match {
269273 case element : Element =>
@@ -279,48 +283,53 @@ class HTMLReader(
279283 val linkText = element.text().trim
280284 if (href.nonEmpty && linkText.nonEmpty && ! visitedNode) {
281285 trackingNodes(element).visited = true
286+ pageMetadata(" element_id" ) = newUUID()
287+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
282288 elements += HTMLElement (
283289 ElementType .LINK ,
284290 content = s " [ $linkText]( $href) " ,
285291 metadata = pageMetadata)
286292 }
293+
287294 case " table" =>
288295 pageMetadata(" sentence" ) = sentenceIndex.toString
289296 sentenceIndex += 1
290297 val tableContent = outputFormat match {
291- case " plain-text" =>
292- extractNestedTableContent(element).trim
298+ case " plain-text" => extractNestedTableContent(element).trim
293299 case " html-table" =>
294300 element
295301 .outerHtml()
296302 .replaceAll(" \\ n" , " " )
297303 .replaceAll(" >\\ s+<" , " ><" )
298304 .replaceAll(" ^\\ s+|\\ s+$" , " " )
299- case " json-table" =>
300- tableElementToJson(element)
301- case _ =>
302- extractNestedTableContent(element).trim
305+ case " json-table" => tableElementToJson(element)
306+ case _ => extractNestedTableContent(element).trim
303307 }
304308 if (tableContent.nonEmpty && ! visitedNode) {
305309 trackingNodes(element).visited = true
310+ pageMetadata(" element_id" ) = newUUID()
311+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
306312 elements += HTMLElement (
307313 ElementType .TABLE ,
308314 content = tableContent,
309315 metadata = pageMetadata)
310316 }
317+
311318 case " li" =>
312319 pageMetadata(" sentence" ) = sentenceIndex.toString
313320 sentenceIndex += 1
314321 val itemText = element.text().trim
315322 if (itemText.nonEmpty && ! visitedNode) {
316323 trackingNodes(element).visited = true
324+ pageMetadata(" element_id" ) = newUUID()
325+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
317326 elements += HTMLElement (
318327 ElementType .LIST_ITEM ,
319328 content = itemText,
320329 metadata = pageMetadata)
321330 }
331+
322332 case " pre" =>
323- // A <pre> tag typically contains a <code> child
324333 val codeElem = element.getElementsByTag(" code" ).first()
325334 val codeText =
326335 if (codeElem != null ) codeElem.text().trim
@@ -329,22 +338,22 @@ class HTMLReader(
329338 pageMetadata(" sentence" ) = sentenceIndex.toString
330339 sentenceIndex += 1
331340 trackingNodes(element).visited = true
341+ pageMetadata(" element_id" ) = newUUID()
342+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
332343 elements += HTMLElement (
333344 ElementType .UNCATEGORIZED_TEXT ,
334345 content = codeText,
335346 metadata = pageMetadata)
336347 }
348+
337349 case tag if isParagraphLikeElement(element) =>
338350 if (! visitedNode) {
339351 val classType = classifyParagraphElement(element)
340-
341- // Traverse children first so that <img>, <a>, etc. inside the paragraph are processed
342352 element.childNodes().asScala.foreach { childNode =>
343353 val tagName = getTagName(childNode)
344354 traverse(childNode, tagName)
345355 }
346356
347- // Now handle the paragraph itself
348357 classType match {
349358 case ElementType .NARRATIVE_TEXT =>
350359 val childNodes = element.childNodes().asScala.toList
@@ -353,6 +362,8 @@ class HTMLReader(
353362 pageMetadata(" sentence" ) = sentenceIndex.toString
354363 sentenceIndex += 1
355364 trackingNodes(element).visited = true
365+ pageMetadata(" element_id" ) = newUUID()
366+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
356367 elements += HTMLElement (
357368 ElementType .NARRATIVE_TEXT ,
358369 content = aggregatedText,
@@ -365,10 +376,13 @@ class HTMLReader(
365376 pageMetadata(" sentence" ) = sentenceIndex.toString
366377 sentenceIndex += 1
367378 trackingNodes(element).visited = true
379+ val titleId = newUUID()
380+ pageMetadata(" element_id" ) = titleId
368381 elements += HTMLElement (
369382 ElementType .TITLE ,
370383 content = titleText,
371384 metadata = pageMetadata)
385+ currentParentId = Some (titleId)
372386 }
373387
374388 case ElementType .UNCATEGORIZED_TEXT =>
@@ -377,28 +391,36 @@ class HTMLReader(
377391 pageMetadata(" sentence" ) = sentenceIndex.toString
378392 sentenceIndex += 1
379393 trackingNodes(element).visited = true
394+ pageMetadata(" element_id" ) = newUUID()
395+ currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
380396 elements += HTMLElement (
381397 ElementType .UNCATEGORIZED_TEXT ,
382398 content = text,
383399 metadata = pageMetadata)
384400 }
385401 }
386402 }
403+
387404 case _ if isTitleElement(element) && ! visitedNode =>
388405 trackingNodes(element).visited = true
389406 val titleText = element.text().trim
390407 if (titleText.nonEmpty) {
391408 pageMetadata(" sentence" ) = sentenceIndex.toString
392409 sentenceIndex += 1
410+ val titleId = newUUID()
411+ pageMetadata(" element_id" ) = titleId
393412 elements += HTMLElement (
394413 ElementType .TITLE ,
395414 content = titleText,
396415 metadata = pageMetadata)
416+ currentParentId = Some (titleId)
397417 }
418+
398419 case " hr" =>
399420 if (element.attr(" style" ).toLowerCase.contains(" page-break" )) {
400421 pageNumber = pageNumber + 1
401422 }
423+
402424 case " img" =>
403425 pageMetadata(" sentence" ) = sentenceIndex.toString
404426 sentenceIndex += 1
@@ -411,7 +433,6 @@ class HTMLReader(
411433 val height = element.attr(" height" ).trim
412434
413435 val imgMetadata = mutable.Map [String , String ](" alt" -> alt) ++ pageMetadata
414-
415436 var contentValue = src
416437 if (isBase64) {
417438 val commaIndex = src.indexOf(',' )
@@ -422,14 +443,16 @@ class HTMLReader(
422443 contentValue = base64Payload
423444 }
424445 }
425-
426446 if (width.nonEmpty) imgMetadata(" width" ) = width
427447 if (height.nonEmpty) imgMetadata(" height" ) = height
448+ imgMetadata(" element_id" ) = newUUID()
449+ currentParentId.foreach(pid => imgMetadata(" parent_id" ) = pid)
428450 elements += HTMLElement (
429451 ElementType .IMAGE ,
430452 content = contentValue,
431453 metadata = imgMetadata)
432454 }
455+
433456 case _ =>
434457 element.childNodes().asScala.foreach { childNode =>
435458 val tagName = getTagName(childNode)
@@ -440,7 +463,6 @@ class HTMLReader(
440463 }
441464 }
442465
443- // Start traversal from the root node
444466 val tagName = getTagName(root)
445467 traverse(root, tagName)
446468 elements.toArray
0 commit comments