diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb index c5eecc20945f3f..659bdbc309f127 100644 --- a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb +++ b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb @@ -20,8 +20,30 @@ "## Setup and Initialization\n", "Let's keep in mind a few things before we start 😊\n", "\n", - "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release.\n", - "\n", + "Support for **Partitioning** files was introduced in Spark NLP 6.0.1. Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and set up Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For local files example we will download different files from Spark NLP Github repo:" ] }, @@ -42,34 +64,34 @@ "base_uri": "https://localhost:8080/" }, "id": "bo7s-jZVrE7W", - "outputId": "e7234d36-765e-4a29-f922-02ceab1626dd" + "outputId": "b0e91448-3b2c-4dab-84c7-5e7d8bad0be5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:05-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2456707 (2.3M) [text/plain]\n", "Saving to: ‘html-files/example-10k.html’\n", "\n", - "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n", + "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:06 (45.1 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", + "2025-06-09 22:10:23 (52.9 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 665 [text/plain]\n", "Saving to: ‘html-files/fake-html.html’\n", "\n", "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:06 (30.2 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "2025-06-09 22:10:24 (18.3 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", "\n" ] } @@ -97,38 +119,36 @@ "base_uri": "https://localhost:8080/" }, "id": "ya8qZe00dalC", - "outputId": "ba520f44-c4b9-45b1-f03c-6a8e3a33320b" + "outputId": "9b4fbf52-9ecc-454b-bef1-0ce31dadb7c7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 15629 (15K) [application/octet-stream]\n", "Saving to: ‘pdf-files/image_3_pages.pdf’\n", "\n", - "\r", - "image_3_pages.pdf 0%[ ] 0 --.-KB/s \r", "image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (25.5 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", + "2025-06-09 22:10:24 (24.3 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 25803 (25K) [application/octet-stream]\n", "Saving to: ‘pdf-files/pdf-title.pdf’\n", "\n", - "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0s \n", + "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (58.5 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", + "2025-06-09 22:10:24 (21.2 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", @@ -137,7 +157,7 @@ "\n", "text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:07 (79.2 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", + "2025-06-09 22:10:24 (73.3 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", "\n" ] } @@ -166,47 +186,45 @@ "base_uri": "https://localhost:8080/" }, "id": "zLLEUl3KpYZ6", - "outputId": "4346e6e1-18ec-47a8-92c0-c8bc588f3441" + "outputId": "407e9405-6cc9-4724-f576-f52c503cb52d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 95087 (93K) [application/octet-stream]\n", "Saving to: ‘word-files/contains-pictures.docx’\n", "\n", - "\r", - "contains-pictures.d 0%[ ] 0 --.-KB/s \r", - "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.01s \n", + "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n", "\n", - "2025-05-26 23:11:07 (6.85 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", + "2025-06-09 22:10:25 (4.74 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 12392 (12K) [application/octet-stream]\n", "Saving to: ‘word-files/fake_table.docx’\n", "\n", "fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:07 (17.7 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", + "2025-06-09 22:10:25 (18.9 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 14584 (14K) [application/octet-stream]\n", "Saving to: ‘word-files/page-breaks.docx’\n", "\n", "page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:08 (22.4 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", + "2025-06-09 22:10:25 (21.5 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", "\n" ] } @@ -235,48 +253,58 @@ "base_uri": "https://localhost:8080/" }, "id": "G3-BCYP6qQ4x", - "outputId": "38489a6e-588d-4a1b-e319-0c7f66559ca0" + "outputId": "95c5a31d-eed9-47a1-bb55-0868daec7da7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 12541 (12K) [application/octet-stream]\n", "Saving to: ‘excel-files/vodafone.xlsx’\n", "\n", "\r", "vodafone.xlsx 0%[ ] 0 --.-KB/s \r", - "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:08 (22.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "2025-06-09 22:10:26 (30.4 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 38442 (38K) [application/octet-stream]\n", "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n", "\n", - "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n", + "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:08 (5.37 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", + "2025-06-09 22:10:26 (3.43 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10676 (10K) [application/octet-stream]\n", + "Saving to: ‘excel-files/page-break-example.xlsx’\n", + "\n", + "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n", "\n", - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", + "2025-06-09 22:10:26 (79.4 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n", + "\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9210 (9.0K) [application/octet-stream]\n", + "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n", + "\n", + "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:10:26 (65.5 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n", "\n" ] } @@ -289,17 +317,6 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files" ] }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "LcSYn6q7jW9-" - }, - "outputs": [], - "source": [ - "!cp drive/MyDrive/JSL/PageBreakExample.xlsx ./excel-files" - ] - }, { "cell_type": "markdown", "metadata": { @@ -317,42 +334,45 @@ "base_uri": "https://localhost:8080/" }, "id": "1jDRFmcHqpxn", - "outputId": "4d59c445-3764-41a8-c91b-9231d401eac6" + "outputId": "cd7e3c96-bb5f-49ab-f466-56ec6be20f75" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 38412 (38K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point.pptx’\n", "\n", - "\r", - "fake-power-point.pp 0%[ ] 0 --.-KB/s \r", - "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.007s \n", + "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:10 (5.29 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", + "2025-06-09 22:10:27 (3.41 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 39894 (39K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n", "\n", - "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.006s \n", + "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n", "\n", - "2025-05-26 23:11:10 (6.73 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", + "2025-06-09 22:10:28 (4.93 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:10 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 39414 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/speaker-notes.pptx’\n", + "\n", + "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.008s \n", + "\n", + "2025-06-09 22:10:28 (4.76 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n", "\n" ] } @@ -381,14 +401,14 @@ "base_uri": "https://localhost:8080/" }, "id": "yYMVpVQurk7G", - "outputId": "cedb0e39-f137-4759-a158-0b84ed31b282" + "outputId": "293a864a-2980-4502-c6dc-a1d3cee815ee" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -399,18 +419,18 @@ " email-tex 0%[ ] 0 --.-KB/s \r", "email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (49.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", + "2025-06-09 22:10:28 (21.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", "\n", - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... 
connected.\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 1324361 (1.3M) [text/plain]\n", "Saving to: ‘email-files/test-several-attachments.eml’\n", "\n", "test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:11 (32.0 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", + "2025-06-09 22:10:29 (30.2 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", "\n" ] } @@ -438,14 +458,14 @@ "base_uri": "https://localhost:8080/" }, "id": "AV-krG6Ps8pq", - "outputId": "c407a77f-11d5-4a3c-85e0-4abffa48bd12" + "outputId": "bd7317e0-97d3-4f30-a800-6ffa8148f266" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", + "--2025-06-09 22:10:29-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", @@ -456,7 +476,7 @@ "simple-text.txt 0%[ ] 0 --.-KB/s \r", "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (4.81 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", + "2025-06-09 22:10:29 (3.39 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", "\n" ] } @@ -466,6 +486,51 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "QVq5C0Uqs4wU" + }, + "source": [ + "**Downloading XML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gip5P7Ess63U", + "outputId": "dde0fa15-2571-4b4a-ef73-517fe2b7a7a7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 22:15:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:15:15 (21.2 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files" + ] + }, { "cell_type": "markdown", "metadata": { @@ -478,13 +543,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bAkMjJ1vdalE", - "outputId": "15401bcc-3cb2-474a-d771-0efed1eaf9cd" + "outputId": "582dcc26-76ea-4cac-c5f6-46e009b639f9" }, "outputs": [ { @@ -519,13 +584,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VWbUgoVQrO8m", - "outputId": "36bbf310-7ee5-474a-93f2-4d940d3c0547" + "outputId": "56f4f9ce-41bb-48ba-b5db-7e1bde47d8d8" }, "outputs": [ { @@ -558,13 +623,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YFzeGJJ3ICVM", - "outputId": "01c349aa-16d2-4e0d-8a30-11399caf2ef2" + "outputId": "fc9bc68c-2b20-479e-8fe8-3e380877cebf" }, "outputs": [ { @@ -597,13 +662,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y_xl0ahaJ0Hy", - "outputId": "6040b119-2eca-4c58-f51b-e20fbefeef8d" + "outputId": "327222b8-0c6b-4578-8fde-4f14f9835edc" }, "outputs": [ { @@ -636,13 +701,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4JnKvFe5KVDf", - "outputId": 
"d91d1ee5-d4a3-48a1-b40a-d5f6bf997025" + "outputId": "c9252fb7-3840-4c95-d461-a56eef9adaea" }, "outputs": [ { @@ -675,13 +740,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_sldwjppKoPl", - "outputId": "467e9085-86dd-43df-f63b-a707b920d3b3" + "outputId": "0619383d-abf4-43a6-f63d-ad81897f8d9e" }, "outputs": [ { @@ -714,13 +779,13 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GltbZAjmKwQs", - "outputId": "c3f18b1f-06df-4233-8874-e9702c465e69" + "outputId": "df9ae11b-0186-4e61-d6ff-9581c597ccd1" }, "outputs": [ { @@ -731,9 +796,9 @@ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] @@ -798,7 +863,9 @@ "| `infer_table_structure` | Word, Excel, PowerPoint | Whether to generate an HTML table 
representation from structured table content. When enabled, a full `<table>` element is added alongside cell-level elements, based on row and column layout. |\n", "| `append_cells` | Excel | Whether to append all rows into a single content block instead of creating separate elements per row. |\n", "| `cell_separator` | Excel | String used to join cell values in a row when assembling textual output |\n", - "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |" + "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |\n", + "| `xml_keep_tags` | XML | Whether to retain original XML tag names and include them in the metadata for each extracted element |\n", + "| `only_leaf_nodes` | XML | If true, only the deepest elements are extracted. If false, all elements are extracted. |" ] }, { @@ -812,13 +879,13 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gDJyUi_9R4fr", - "outputId": "4aebe625-444d-4161-be23-512708ced1b5" + "outputId": "181d8e88-7a0b-4a6e-f497-7fd4add3726c" }, "outputs": [ { @@ -830,8 +897,8 @@ "| path| doc|\n", "+--------------------+--------------------+\n", "|file:/content/wor...|[{NarrativeText, ...|\n", - "|file:/content/wor...|[{Header, An inli...|\n", "|file:/content/wor...|[{Table, Header C...|\n", + "|file:/content/wor...|[{Header, An inli...|\n", "+--------------------+--------------------+\n", "\n" ] @@ -843,50 +910,23 @@ ] }, { - "cell_type": "code", - "execution_count": 23, + "cell_type": "markdown", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3vz48AHQHyON", - "outputId": "f3ba8c4b-3bfc-453a-d8d4-f86a5fca0a1b" + "id": "F0lCz9OyPYYh" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning::Spark Session already created, some configs 
may not take.\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "\n" - ] - } - ], "source": [ - "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", - "partition_df.show()" + "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "S50lqIFskNO3", - "outputId": "e52f4cde-cfb9-4a55-d989-6e9fe40a0321" + "id": "qExdRJ2aPsYV", + "outputId": "9a033a02-4bae-4570-aaba-b81c23b8e0e1" }, "outputs": [ { @@ -894,38 +934,40 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - 
"+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |xls |\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/excel-files/PageBreakExample.xlsx|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 
1), SheetName -> Sheet1}}]|\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+--------------------+--------------------+--------------------+\n", + "| path| doc| content|\n", + "+--------------------+--------------------+--------------------+\n", + "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", + "+--------------------+--------------------+--------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = \"application/vnd.ms-excel\").partition(\"./excel-files/PageBreakExample.xlsx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df.show()" ] }, { "cell_type": "markdown", "metadata": { - "id": "F0lCz9OyPYYh" + "id": "E3bCFJZn8TS0" }, "source": [ - "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" + "## Partitioning PDF Files" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qExdRJ2aPsYV", - "outputId": 
"0284de34-ce6a-4d1e-91bc-268521111015" + "id": "3vz48AHQHyON", + "outputId": "19369e63-f963-4422-a791-57ea5394df1a" }, "outputs": [ { @@ -933,19 +975,23 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+--------------------+--------------------+--------------------+\n", - "| path| doc| content|\n", - "+--------------------+--------------------+--------------------+\n", - "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", - "+--------------------+--------------------+--------------------+\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = 
\"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", "partition_df.show()" ] }, @@ -969,13 +1015,13 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_2J0zEmma8jm", - "outputId": "405391bf-60bf-4632-ef0e-e84496049c71" + "outputId": "90f668d7-03d9-496f-dc82-a620c59f9c08" }, "outputs": [ { @@ -1018,13 +1064,13 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4sY2ADN8dusy", - "outputId": "98af2c82-8a55-46ff-f631-7775431820cb" + "outputId": "8164237e-6835-404a-d7a7-b5ef0ef99c6d" }, "outputs": [ { @@ -1046,24 +1092,33 @@ "partition_df.show(truncate=False)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMyqJX-K7dss" + }, + "source": [ + "## Partitioning MS Office documents" + ] + }, { "cell_type": "markdown", "metadata": { "id": "_9dDTCrpGdoN" }, "source": [ - "For Word documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." + "For Excel documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." 
] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7ICTZmLGk3Sa", - "outputId": "5e31a551-2746-4c45-b933-56f55e4866c9" + "outputId": "1796055a-808c-4eff-fc86-14e29cf9b53e" }, "outputs": [ { @@ -1087,13 +1142,13 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YId4UG1rOVQq", - "outputId": "7de8b4be-9936-4330-8a0f-019c3a55182a" + "outputId": "32827dea-d7b3-4137-abff-9e4502f8cd93" }, "outputs": [ { @@ -1118,38 +1173,21 @@ { "cell_type": "markdown", "metadata": { - "id": "jpRmFNPNNqkf" - }, - "source": [ - "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "HwnYBQ5l7rDM" + "id": "E8ockED4NxLi" }, - "outputs": [], "source": [ - "text = (\n", - " \"The big brown fox\\n\"\n", - " \"was walking down the lane.\\n\"\n", - " \"\\n\"\n", - " \"At the end of the lane,\\n\"\n", - " \"the fox met a bear.\"\n", - " )" + "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." 
] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "mutwZUFj720X", - "outputId": "87cd31c5-2f94-4777-9ea5-b6edf8277347" + "id": "fPCpk7RTGRjo", + "outputId": "a818ecd7-8580-4098-b30f-6e46b8ef6baa" }, "outputs": [ { @@ -1157,61 +1195,77 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|txt |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |ppt |\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find 
the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", - "text_df.show(truncate=False)" + "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", + "partition_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "E8ockED4NxLi" + "id": "qRfRSGvhN303" }, "source": [ - "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." + "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." 
] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "xF8F-5CP3qWY", - "outputId": "71b5e0cb-b22a-4774-a7b6-83c4fd67fadb" + "id": "twLdjGxZWiOJ", + "outputId": "8adcaa80-b02c-4e8f-8205-20efa8c40b4b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "fake-power-point.pptx fake-power-point-table.pptx\n" + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML,
DateFri Jul 19 00:00:00 UTC 2024
AssetsDebts
Bank15865.43Credit Card12000.0
Bank210140.19Credit Card21500.0
Bank31200.0Credit Card3348.0
Bank41438.27TotalSUM(F3:F5)
TotalSUM(B3:B6)
, {SheetName -> Sheet1}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" ] } ], "source": [ - "!ls ppt-files" + "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8txswwbjN8Mg" + }, + "source": [ + "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." 
] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "fPCpk7RTGRjo", - "outputId": "74144c26-5060-4c99-f291-a097b838e774" + "id": "PQ4MpGw6xCko", + "outputId": "aaf807a7-27b9-40cc-8a75-58be077f8403" }, "outputs": [ { @@ -1219,38 +1273,64 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |ppt |\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + 
"+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "qRfRSGvhN303" + "id": "_GyL6D4N75i-" }, "source": [ - "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." 
+ "## Partitioning Text Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jpRmFNPNNqkf" + }, + "source": [ + "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "HwnYBQ5l7rDM" + }, + "outputs": [], + "source": [ + "text = (\n", + " \"The big brown fox\\n\"\n", + " \"was walking down the lane.\\n\"\n", + " \"\\n\"\n", + " \"At the end of the lane,\\n\"\n", + " \"the fox met a bear.\"\n", + " )" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "twLdjGxZWiOJ", - "outputId": "ec340358-7279-4247-b27c-5a0a25f38ee6" + "id": "mutwZUFj720X", + "outputId": "8b4f474d-2f3f-4e81-cecf-5de420561124" }, "outputs": [ { @@ -1258,38 +1338,47 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML,
DateFri Jul 19 00:00:00 UTC 2024
AssetsDebts
Bank15865.43Credit Card12000.0
Bank210140.19Credit Card21500.0
Bank31200.0Credit Card3348.0
Bank41438.27TotalSUM(F3:F5)
TotalSUM(B3:B6)
, {SheetName -> Sheet1}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|txt |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", - 
"partition_df.select(\"xls\").show(truncate=False)" + "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", + "text_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "8txswwbjN8Mg" + "id": "epCp5DnQ8E7o" }, "source": [ - "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." + "## Partitioning XML Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DWX0nkc4tM7J" + }, + "source": [ + "In Spark NLP 6.0.3 we added support for XML files" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "PQ4MpGw6xCko", - "outputId": "808783d2-f15b-45ae-90fb-a623243898f3" + "id": "AViMSzKQtP-o", + "outputId": "147a1ef9-3f14-4832-a050-e60c8ac9544b" }, "outputs": [ { @@ -1297,18 +1386,18 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. 
e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]|\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", - "partition_df.select(\"xls\").show(truncate=False)" + "partition_df = Partition(xml_keep_tags = True).partition(\"./xml-files/multi-level.xml\")\n", + "partition_df.select(\"xml\").show(truncate=False)" ] } ], diff --git a/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb new file mode 100644 index 00000000000000..38b43aed37b95e --- /dev/null +++ b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing XML reader in SparkNLP\n", + "This notebook showcases the newly added `sparknlp.read().xml()` method in Spark NLP that parses XML content from both local files and real-time URLs into a Spark DataFrame.\n", + "\n", + "**Key Features:**\n", + "- Ability to parse XML from local directories and URLs.\n", + "- Versatile support for varied data ingestion scenarios." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for reading XML files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y3hWfT5q-npM" + }, + "source": [ + "- Let's install and set up Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "u3ORYVyb-pRI" + }, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oIbFQyEo-tat" + }, + "source": [ + "For the local files example, we will download a couple of XML files from the Spark NLP GitHub repo:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ya8qZe00dalC", + "outputId": "7d597910-9826-4472-9fdc-5b8ac398e6cf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (34.0 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n", + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/test.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 382 [text/plain]\n", + "Saving to: ‘xml-files/test.xml’\n", + "\n", + "test.xml 100%[===================>] 382 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (7.58 MB/s) - ‘xml-files/test.xml’ saved [382/382]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/test.xml -P xml-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Parsing XML from Local Files\n", + "Use the `xml()` method to parse XML content from local directories." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "0bba10be-75de-48de-9a06-d6197d35218f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+--------------------+--------------------+\n", + "| path| xml|\n", + "+--------------------+--------------------+\n", + "|file:/content/xml...|[{Title, Harry Po...|\n", + "|file:/content/xml...|[{Title, The Alch...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "xml_df = sparknlp.read().xml(\"./xml-files\")\n", + "\n", + "xml_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oBj0cHPXSD1m", + "outputId": "00951736-40d4-4f9e-fe25-cc5117405269" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- path: string (nullable = true)\n", + " |-- xml: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- elementType: string (nullable = true)\n", + " | | |-- content: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + "\n" + ] + } + ], + "source": [ + "xml_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FrVKxdySz8pR" + }, + "source": [ + "### Configuration Parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CC_klLwhV8um" + }, + "source": [ + "`xmlKeepTags`: When true, includes the tag name of each XML element in the metadata under the key `tag`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aNfN0fQC0Vzz", + "outputId": "ebdb1393-b91c-4c60-d7e7-b7ecc6465171" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> title}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> author}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> year}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> price}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> title}}, {Title, Erik T. 
Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> author}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> year}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> price}}]|\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}] |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"xmlKeepTags\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t06KtTItWQ4R" + }, + "source": [ + "`onlyLeafNodes`: When true, includes only leaf elements (i.e., elements with no child elements) in the output. When false, all elements (including containers) are included." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jTM1btqNntUL", + "outputId": "f86a0b28-73ac-46d1-8d26-f920e2d935cd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{UncategorizedText, , {elementId -> 931f811d0c9b488a01a7875f80992a62}}, {UncategorizedText, , {elementId -> 1f610d9429ab17d0d7ab49ee3069b4fc, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. 
Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, , {elementId -> 249aff1b3e9835325b45e51cdfc4ad46, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}] |\n", + "|[{UncategorizedText, , {elementId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> d7416d9cac3ba3af57ef6b6b71d7841b, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> b79ae4ca74ec00f63a00b6cd66acc1e0, parentId -> d7416d9cac3ba3af57ef6b6b71d7841b}}, {UncategorizedText, , {elementId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, parentId -> b79ae4ca74ec00f63a00b6cd66acc1e0}}, {Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, , {elementId -> 9ebecf846e7dea80c563ebcb2f7d4a9a, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> 80472cd1880f453b8adecc61870748ba, parentId -> 9ebecf846e7dea80c563ebcb2f7d4a9a}}, {UncategorizedText, 
, {elementId -> 9708b29025b53d9f54c723ee005b647b, parentId -> 80472cd1880f453b8adecc61870748ba}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + 
"\n" + ] + } + ], + "source": [ + "params = {\"onlyLeafNodes\": \"false\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O8DePUq8nkYm" + }, + "source": [ + "You can access the raw content of the file using the `storeContent` parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E0S5aRb5WFLf", + "outputId": "5e624eeb-fbc1-47a4-ff21-aef410a10bb2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |content |xml |\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|file:/content/xml-files/test.xml |\\n \\n Harry Potter\\n J K. 
Rowling\\n 2005\\n 29.99\\n \\n \\n Learning XML\\n Erik T. Ray\\n 2003\\n 39.95\\n \\n |[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]|\n", + "|file:/content/xml-files/multi-level.xml|\\n
\\n \\n \\n The Alchemist\\n Paulo Coelho\\n 1988\\n \\n \\n
\\n
\\n \\n \\n A Brief History of Time\\n Stephen Hawking\\n 1988\\n \\n \\n
\\n
\\n|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}] |\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"storeContent\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.show(truncate=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py index dfd865116f3821..86bf5781053050 100644 --- a/python/sparknlp/reader/sparknlp_reader.py +++ b/python/sparknlp/reader/sparknlp_reader.py @@ -322,4 +322,49 @@ def txt(self, docPath): if not isinstance(docPath, str): raise TypeError("docPath must be a string") jdf = self._java_obj.txt(docPath) + return self.getDataFrame(self.spark, jdf) + + def xml(self, docPath): + """Reads XML files and returns a Spark DataFrame. + + Parameters + ---------- + docPath : str + Path to an XML file or a directory containing XML files. + + Returns + ------- + pyspark.sql.DataFrame + A DataFrame containing parsed XML content. 
+ + Examples + -------- + >>> from sparknlp.reader import SparkNLPReader + >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory") + + You can use SparkNLP for one line of code + + >>> import sparknlp + >>> xml_df = sparknlp.read().xml("home/user/xml-directory") + >>> xml_df.show(truncate=False) + +-----------------------------------------------------------+ + |xml | + +-----------------------------------------------------------+ + |[{Title, John Smith, {elementId -> ..., tag -> title}}] | + +-----------------------------------------------------------+ + + >>> xml_df.printSchema() + root + |-- path: string (nullable = true) + |-- xml: array (nullable = true) + | |-- element: struct (containsNull = true) + | | |-- elementType: string (nullable = true) + | | |-- content: string (nullable = true) + | | |-- metadata: map (nullable = true) + | | | |-- key: string + | | | |-- value: string (valueContainsNull = true) + """ + if not isinstance(docPath, str): + raise TypeError("docPath must be a string") + jdf = self._java_obj.xml(docPath) return self.getDataFrame(self.spark, jdf) \ No newline at end of file diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py index 68ea10b36476bf..c2baa14fec213d 100644 --- a/python/test/sparknlp_test.py +++ b/python/test/sparknlp_test.py @@ -125,4 +125,18 @@ def runTest(self): txt_df = sparknlp.read().txt(self.txt_file) txt_df.show() - self.assertTrue(txt_df.select("txt").count() > 0) \ No newline at end of file + self.assertTrue(txt_df.select("txt").count() > 0) + + +@pytest.mark.fast +class SparkNLPTestXMLFilesSpec(unittest.TestCase): + + def setUp(self): + self.data = SparkContextForTest.data + self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml" + + def runTest(self): + xml_df = sparknlp.read().xml(self.xml_files) + xml_df.show() + + self.assertTrue(xml_df.select("xml").count() > 0) \ No newline at end of file diff --git 
a/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala new file mode 100644 index 00000000000000..4993bc65a8cd8b --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala @@ -0,0 +1,38 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable +import org.apache.spark.ml.param.Param + +trait HasXmlReaderProperties extends ParamsAndFeaturesWritable { + + val xmlKeepTags = new Param[Boolean]( + this, + "xmlKeepTags", + "Whether to include XML tag names as metadata in the output.") + + def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value) + + val onlyLeafNodes = new Param[Boolean]( + this, + "onlyLeafNodes", + "If true, only processes XML leaf nodes (no nested children).") + + def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value) + + setDefault(xmlKeepTags -> false, onlyLeafNodes -> true) +} diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala index a339fe9b258ee4..2e6f69b8c5b4c4 100644 --- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala +++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala @@ -188,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new 
java.util.HashMap()) "application/vnd.openxmlformats-officedocument.presentationml.presentation" => sparkNLPReader.ppt case "application/pdf" => sparkNLPReader.pdf + case "application/xml" => sparkNLPReader.xml case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType") } } @@ -199,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case "text/plain" => sparkNLPReader.txtToHTMLElement case "text/html" => sparkNLPReader.htmlToHTMLElement case "url" => sparkNLPReader.urlToHTMLElement + case "application/xml" => sparkNLPReader.xmlToHTMLElement case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType") } } @@ -234,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case "xls" | "xlsx" => sparkNLPReader.xls case "ppt" | "pptx" => sparkNLPReader.ppt case "pdf" => sparkNLPReader.pdf + case "xml" => sparkNLPReader.xml case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension") } } diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala index 73d461c91aaafc..281af53931d72c 100644 --- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala @@ -86,6 +86,7 @@ class PartitionTransformer(override val uid: String) with HasPowerPointProperties with HasTextReaderProperties with HasPdfProperties + with HasXmlReaderProperties with HasChunkerProperties { def this() = this(Identifiable.randomUID("PartitionTransformer")) @@ -157,7 +158,9 @@ class PartitionTransformer(override val uid: String) "newAfterNChars" -> $(newAfterNChars).toString, "overlap" -> $(overlap).toString, "combineTextUnderNChars" -> $(combineTextUnderNChars).toString, - "overlapAll" -> $(overlapAll).toString) + "overlapAll" -> $(overlapAll).toString, + "xmlKeepTags" -> 
$(xmlKeepTags).toString, + "onlyLeafNodes" -> $(onlyLeafNodes).toString) val partitionInstance = new Partition(params.asJava) val inputColum = if (get(inputCols).isDefined) { diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala index a1637116cb7905..216492876cc718 100644 --- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala @@ -296,7 +296,6 @@ class SparkNLPReader( * |-- width_dimension: integer (nullable = true) * |-- content: binary (nullable = true) * |-- exception: string (nullable = true) - * |-- pagenum: integer (nullable = true) * }}} * * @param params @@ -642,4 +641,69 @@ class SparkNLPReader( default = BLOCK_SPLIT_PATTERN) } + /** Instantiates class to read XML files. + * + * xmlPath: this is a path to a directory of XML files or a path to an XML file. E.g., + * "path/xml/files" + * + * ==Example== + * {{{ + * val xmlPath = "home/user/xml-directory" + * val sparkNLPReader = new SparkNLPReader() + * val xmlDf = sparkNLPReader.xml(xmlPath) + * }}} + * + * ==Example 2== + * You can use SparkNLP for one line of code + * {{{ + * val xmlDf = SparkNLP.read.xml(xmlPath) + * }}} + * + * {{{ + * xmlDf.select("xml").show(false) + * +------------------------------------------------------------------------------------------------------------------------+ + * |xml | + * +------------------------------------------------------------------------------------------------------------------------+ + * |[{Title, John Smith, {elementId -> ..., tag -> title}}, {UncategorizedText, Some content..., {elementId -> ...}}] | + * +------------------------------------------------------------------------------------------------------------------------+ + * + * xmlDf.printSchema() + * root + * |-- path: string (nullable = true) + * |-- xml: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- 
elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * + * @param xmlPath + * Path to the XML file or directory + * @return + * A DataFrame with parsed XML as structured elements + */ + + def xml(xmlPath: String): DataFrame = { + val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes) + xmlReader.read(xmlPath) + } + + def xmlToHTMLElement(xml: String): Seq[HTMLElement] = { + val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes) + xmlReader.parseXml(xml) + } + + private def getXmlKeepTags: Boolean = { + getDefaultBoolean(params.asScala.toMap, Seq("xmlKeepTags", "xml_keep_tags"), default = false) + } + + private def getOnlyLeafNodes: Boolean = { + getDefaultBoolean( + params.asScala.toMap, + Seq("onlyLeafNodes", "only_leaf_nodes"), + default = true) + } + } diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala new file mode 100644 index 00000000000000..fc777458dafb83 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala @@ -0,0 +1,150 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package com.johnsnowlabs.reader

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile
import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}

import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.xml.{Elem, Node, XML}

/** Class to parse and read XML files.
  *
  * ==Input Format==
  * Input must be a valid path to an XML file or a directory containing XML files.
  *
  * ==Example==
  * {{{
  * val xmlPath = "./data/sample.xml"
  * val xmlReader = new XMLReader()
  * val xmlDf = xmlReader.read(xmlPath)
  * }}}
  *
  * {{{
  * xmlDf.show(truncate = false)
  * +----------------------+------------------------------------------------------------+
  * |path                  |xml                                                         |
  * +----------------------+------------------------------------------------------------+
  * |file:/data/sample.xml |[{Title, My Book, {elementId -> ..., parentId -> ...}}, ...] |
  * +----------------------+------------------------------------------------------------+
  *
  * xmlDf.printSchema()
  * root
  *  |-- path: string (nullable = true)
  *  |-- xml: array (nullable = true)
  *  |    |-- element: struct (containsNull = true)
  *  |    |    |-- elementType: string (nullable = true)
  *  |    |    |-- content: string (nullable = true)
  *  |    |    |-- metadata: map (nullable = true)
  *  |    |    |    |-- key: string
  *  |    |    |    |-- value: string (valueContainsNull = true)
  * }}}
  *
  * For more examples refer to:
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb notebook]]
  *
  * @param storeContent
  *   Whether to include the raw XML content in the resulting DataFrame as a separate 'content'
  *   column. By default, this is false.
  * @param xmlKeepTags
  *   Whether to retain original XML tag names (lower-cased) and include them in the metadata
  *   for each extracted element under the key `tag`. Default is false.
  * @param onlyLeafNodes
  *   If true, only the deepest elements (those without child elements) are extracted. If false,
  *   all elements are extracted; elements that have child elements are emitted with empty
  *   content. Default is true.
  */
class XMLReader(
    storeContent: Boolean = false,
    xmlKeepTags: Boolean = false,
    onlyLeafNodes: Boolean = true)
    extends Serializable {

  private lazy val spark = ResourceHelper.spark

  private var outputColumn = "xml"

  /** Sets the name of the column that receives the parsed elements (default `"xml"`).
    *
    * @param value
    *   Non-empty column name
    * @return
    *   this reader, for call chaining
    * @throws IllegalArgumentException
    *   if `value` is empty
    */
  def setOutputColumn(value: String): this.type = {
    require(value.nonEmpty, "Output column name cannot be empty.")
    outputColumn = value
    this
  }

  /** Reads the XML file(s) at `inputSource` and parses each one into a list of elements.
    *
    * @param inputSource
    *   Path to an XML file or to a directory of XML files
    * @return
    *   A DataFrame with the columns `path` and the output column, plus `content` when
    *   `storeContent` is enabled
    * @throws IllegalArgumentException
    *   if `inputSource` is rejected by `validFile`
    */
  def read(inputSource: String): DataFrame = {
    if (validFile(inputSource)) {
      val xmlDf = datasetWithTextFile(spark, inputSource)
        .withColumn(outputColumn, parseXmlUDF(col("content")))
      if (storeContent) xmlDf.select("path", "content", outputColumn)
      else xmlDf.select("path", outputColumn)
    } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource")
  }

  private val parseXmlUDF = udf((xml: String) => {
    parseXml(xml)
  })

  /** Parses an XML document and flattens it, in document order, into a list of elements.
    *
    * Every emitted element carries an `elementId` in its metadata, a `parentId` when it is not
    * the root, and a `tag` when `xmlKeepTags` is enabled. `title` and `author` tags are typed as
    * [[ElementType.TITLE]]; every other tag as [[ElementType.UNCATEGORIZED_TEXT]].
    *
    * Malformed XML is not handled here: whatever `XML.loadString` throws propagates to the
    * caller.
    *
    * @param xmlString
    *   The XML document text
    * @return
    *   The extracted elements
    */
  def parseXml(xmlString: String): List[HTMLElement] = {
    val xml = XML.loadString(xmlString)
    val elements = ListBuffer[HTMLElement]()

    def traverse(node: Node, parentId: Option[String]): Unit = {
      node match {
        case elem: Elem =>
          // Locale.ROOT keeps tag names (and the "title"/"author" match below) independent of
          // the JVM default locale.
          val tagName = elem.label.toLowerCase(java.util.Locale.ROOT)
          val textContent = elem.text.trim
          // NOTE(review): the id is derived from tag + text only, so two elements with the same
          // tag and text (e.g. the same <year> value in two records) share one elementId, and a
          // parentId can then match more than one element. Confirm whether ids must be unique.
          val elementId = hash(tagName + textContent)

          val isLeaf = elem.child.forall {
            case _: Elem => false
            case _ => true
          }

          if (!onlyLeafNodes || isLeaf) {
            val elementType = tagName match {
              case "title" | "author" => ElementType.TITLE
              case _ => ElementType.UNCATEGORIZED_TEXT
            }

            val metadata = mutable.Map[String, String]("elementId" -> elementId)
            if (xmlKeepTags) metadata += ("tag" -> tagName)
            parentId.foreach(id => metadata += ("parentId" -> id))

            // Elements with child elements carry no text of their own in the output; their
            // text is reported on the leaf elements instead.
            val content = if (isLeaf) textContent else ""
            elements += HTMLElement(elementType, content, metadata)
          }

          // Children are linked to this element's id even when this element itself was skipped
          // because onlyLeafNodes is enabled.
          elem.child.foreach(traverse(_, Some(elementId)))

        case _ => // Non-element nodes are ignored
      }
    }

    traverse(xml, None)
    elements.toList
  }

  /** Computes the MD5 digest of a string as a lower-case hexadecimal string.
    *
    * The string is encoded as UTF-8 before hashing so that the same text yields the same id on
    * every JVM, whatever its default charset.
    *
    * @param s
    *   Text to hash
    * @return
    *   32-character hexadecimal MD5 digest
    */
  def hash(s: String): String = {
    java.security.MessageDigest
      .getInstance("MD5")
      .digest(s.getBytes(java.nio.charset.StandardCharsets.UTF_8))
      .map("%02x".format(_))
      .mkString
  }

}
+ + + The Alchemist + Paulo Coelho + 1988 + + +
+
+ + + A Brief History of Time + Stephen Hawking + 1988 + + +
+
package com.johnsnowlabs.reader

import com.johnsnowlabs.tags.FastTest
import org.apache.spark.sql.functions.{array_contains, col, explode, map_keys}
import org.scalatest.flatspec.AnyFlatSpec

/** Tests for [[XMLReader]] against the XML fixtures under `src/test/resources/reader/xml`. */
class XMLReaderTest extends AnyFlatSpec {

  // No trailing slash: the tests below append "/<file name>" themselves.
  val xmlFilesDirectory = "./src/test/resources/reader/xml"

  "XMLReader" should "read xml as dataframe" taggedAs FastTest in {
    val xmlReader = new XMLReader()
    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/test.xml")
    xmlDF.show(truncate = false)

    assert(!xmlDF.select(col("xml").getItem(0)).isEmpty)
    // storeContent defaults to false, so the raw XML column must not be present
    assert(!xmlDF.columns.contains("content"))
  }

  it should "include tags in the output" taggedAs FastTest in {
    val xmlReader = new XMLReader(xmlKeepTags = true)
    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
    xmlDF.show(truncate = false)

    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
    val tagsDf = explodedDf.filter(col("xml_exploded.metadata")("tag") =!= "")

    assert(tagsDf.count() > 0)
  }

  it should "output all nodes" taggedAs FastTest in {
    val xmlReader = new XMLReader(onlyLeafNodes = false)
    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
    xmlDF.show(truncate = false)
    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))

    // Only an element without a parent lacks "parentId"; it is emitted only when non-leaf
    // nodes are part of the output.
    val withoutParentDf = explodedDf
      .filter(!array_contains(map_keys(col("xml_exploded.metadata")), "parentId"))

    assert(withoutParentDf.count() > 0)
  }

}