diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
index c5eecc20945f3f..659bdbc309f127 100644
--- a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
+++ b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
@@ -20,8 +20,30 @@
"## Setup and Initialization\n",
"Let's keep in mind a few things before we start 😊\n",
"\n",
- "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release.\n",
- "\n",
+ "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- Let's install and setup Spark NLP in Google Colab\n",
+ "- This part is pretty easy via our simple script"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"For local files example we will download different files from Spark NLP Github repo:"
]
},
@@ -42,34 +64,34 @@
"base_uri": "https://localhost:8080/"
},
"id": "bo7s-jZVrE7W",
- "outputId": "e7234d36-765e-4a29-f922-02ceab1626dd"
+ "outputId": "b0e91448-3b2c-4dab-84c7-5e7d8bad0be5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:05-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n",
+ "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2456707 (2.3M) [text/plain]\n",
"Saving to: ‘html-files/example-10k.html’\n",
"\n",
- "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n",
+ "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.04s \n",
"\n",
- "2025-05-26 23:11:06 (45.1 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n",
+ "2025-06-09 22:10:23 (52.9 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n",
"\n",
- "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 665 [text/plain]\n",
"Saving to: ‘html-files/fake-html.html’\n",
"\n",
"fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n",
"\n",
- "2025-05-26 23:11:06 (30.2 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n",
+ "2025-06-09 22:10:24 (18.3 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n",
"\n"
]
}
@@ -97,38 +119,36 @@
"base_uri": "https://localhost:8080/"
},
"id": "ya8qZe00dalC",
- "outputId": "ba520f44-c4b9-45b1-f03c-6a8e3a33320b"
+ "outputId": "9b4fbf52-9ecc-454b-bef1-0ce31dadb7c7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
+ "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 15629 (15K) [application/octet-stream]\n",
"Saving to: ‘pdf-files/image_3_pages.pdf’\n",
"\n",
- "\r",
- "image_3_pages.pdf 0%[ ] 0 --.-KB/s \r",
"image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n",
"\n",
- "2025-05-26 23:11:06 (25.5 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n",
+ "2025-06-09 22:10:24 (24.3 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n",
"\n",
- "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n",
+ "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 25803 (25K) [application/octet-stream]\n",
"Saving to: ‘pdf-files/pdf-title.pdf’\n",
"\n",
- "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0s \n",
+ "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.001s \n",
"\n",
- "2025-05-26 23:11:06 (58.5 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n",
+ "2025-06-09 22:10:24 (21.2 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n",
"\n",
- "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n",
+ "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
@@ -137,7 +157,7 @@
"\n",
"text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n",
"\n",
- "2025-05-26 23:11:07 (79.2 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n",
+ "2025-06-09 22:10:24 (73.3 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n",
"\n"
]
}
@@ -166,47 +186,45 @@
"base_uri": "https://localhost:8080/"
},
"id": "zLLEUl3KpYZ6",
- "outputId": "4346e6e1-18ec-47a8-92c0-c8bc588f3441"
+ "outputId": "407e9405-6cc9-4724-f576-f52c503cb52d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n",
+ "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 95087 (93K) [application/octet-stream]\n",
"Saving to: ‘word-files/contains-pictures.docx’\n",
"\n",
- "\r",
- "contains-pictures.d 0%[ ] 0 --.-KB/s \r",
- "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.01s \n",
+ "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n",
"\n",
- "2025-05-26 23:11:07 (6.85 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n",
+ "2025-06-09 22:10:25 (4.74 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n",
"\n",
- "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 12392 (12K) [application/octet-stream]\n",
"Saving to: ‘word-files/fake_table.docx’\n",
"\n",
"fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n",
"\n",
- "2025-05-26 23:11:07 (17.7 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n",
+ "2025-06-09 22:10:25 (18.9 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n",
"\n",
- "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 14584 (14K) [application/octet-stream]\n",
"Saving to: ‘word-files/page-breaks.docx’\n",
"\n",
"page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n",
"\n",
- "2025-05-26 23:11:08 (22.4 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n",
+ "2025-06-09 22:10:25 (21.5 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n",
"\n"
]
}
@@ -235,48 +253,58 @@
"base_uri": "https://localhost:8080/"
},
"id": "G3-BCYP6qQ4x",
- "outputId": "38489a6e-588d-4a1b-e319-0c7f66559ca0"
+ "outputId": "95c5a31d-eed9-47a1-bb55-0868daec7da7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 12541 (12K) [application/octet-stream]\n",
"Saving to: ‘excel-files/vodafone.xlsx’\n",
"\n",
"\r",
"vodafone.xlsx 0%[ ] 0 --.-KB/s \r",
- "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n",
+ "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0s \n",
"\n",
- "2025-05-26 23:11:08 (22.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n",
+ "2025-06-09 22:10:26 (30.4 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n",
"\n",
- "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n",
+ "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 38442 (38K) [application/octet-stream]\n",
"Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n",
"\n",
- "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n",
+ "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.01s \n",
"\n",
- "2025-05-26 23:11:08 (5.37 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n",
+ "2025-06-09 22:10:26 (3.43 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n",
"\n",
- "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n",
+ "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
- "HTTP request sent, awaiting response... 404 Not Found\n",
- "2025-05-26 23:11:09 ERROR 404: Not Found.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 10676 (10K) [application/octet-stream]\n",
+ "Saving to: ‘excel-files/page-break-example.xlsx’\n",
+ "\n",
+ "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n",
"\n",
- "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n",
+ "2025-06-09 22:10:26 (79.4 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n",
+ "\n",
+ "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
- "HTTP request sent, awaiting response... 404 Not Found\n",
- "2025-05-26 23:11:09 ERROR 404: Not Found.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 9210 (9.0K) [application/octet-stream]\n",
+ "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n",
+ "\n",
+ "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n",
+ "\n",
+ "2025-06-09 22:10:26 (65.5 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n",
"\n"
]
}
@@ -289,17 +317,6 @@
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files"
]
},
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "id": "LcSYn6q7jW9-"
- },
- "outputs": [],
- "source": [
- "!cp drive/MyDrive/JSL/PageBreakExample.xlsx ./excel-files"
- ]
- },
{
"cell_type": "markdown",
"metadata": {
@@ -317,42 +334,45 @@
"base_uri": "https://localhost:8080/"
},
"id": "1jDRFmcHqpxn",
- "outputId": "4d59c445-3764-41a8-c91b-9231d401eac6"
+ "outputId": "cd7e3c96-bb5f-49ab-f466-56ec6be20f75"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 38412 (38K) [application/octet-stream]\n",
"Saving to: ‘ppt-files/fake-power-point.pptx’\n",
"\n",
- "\r",
- "fake-power-point.pp 0%[ ] 0 --.-KB/s \r",
- "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.007s \n",
+ "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.01s \n",
"\n",
- "2025-05-26 23:11:10 (5.29 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n",
+ "2025-06-09 22:10:27 (3.41 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n",
"\n",
- "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n",
+ "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 39894 (39K) [application/octet-stream]\n",
"Saving to: ‘ppt-files/fake-power-point-table.pptx’\n",
"\n",
- "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.006s \n",
+ "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n",
"\n",
- "2025-05-26 23:11:10 (6.73 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n",
+ "2025-06-09 22:10:28 (4.93 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n",
"\n",
- "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n",
+ "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
- "HTTP request sent, awaiting response... 404 Not Found\n",
- "2025-05-26 23:11:10 ERROR 404: Not Found.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 39414 (38K) [application/octet-stream]\n",
+ "Saving to: ‘ppt-files/speaker-notes.pptx’\n",
+ "\n",
+ "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.008s \n",
+ "\n",
+ "2025-06-09 22:10:28 (4.76 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n",
"\n"
]
}
@@ -381,14 +401,14 @@
"base_uri": "https://localhost:8080/"
},
"id": "yYMVpVQurk7G",
- "outputId": "cedb0e39-f137-4759-a158-0b84ed31b282"
+ "outputId": "293a864a-2980-4502-c6dc-a1d3cee815ee"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n",
+ "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
@@ -399,18 +419,18 @@
" email-tex 0%[ ] 0 --.-KB/s \r",
"email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n",
"\n",
- "2025-05-26 23:11:11 (49.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
+ "2025-06-09 22:10:28 (21.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
"\n",
- "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+ "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1324361 (1.3M) [text/plain]\n",
"Saving to: ‘email-files/test-several-attachments.eml’\n",
"\n",
"test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.04s \n",
"\n",
- "2025-05-26 23:11:11 (32.0 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
+ "2025-06-09 22:10:29 (30.2 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
"\n"
]
}
@@ -438,14 +458,14 @@
"base_uri": "https://localhost:8080/"
},
"id": "AV-krG6Ps8pq",
- "outputId": "c407a77f-11d5-4a3c-85e0-4abffa48bd12"
+ "outputId": "bd7317e0-97d3-4f30-a800-6ffa8148f266"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n",
+ "--2025-06-09 22:10:29-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
@@ -456,7 +476,7 @@
"simple-text.txt 0%[ ] 0 --.-KB/s \r",
"simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n",
"\n",
- "2025-05-26 23:11:11 (4.81 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
+ "2025-06-09 22:10:29 (3.39 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
"\n"
]
}
@@ -466,6 +486,51 @@
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QVq5C0Uqs4wU"
+ },
+ "source": [
+ "**Downloading XML files**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Gip5P7Ess63U",
+ "outputId": "dde0fa15-2571-4b4a-ef73-517fe2b7a7a7"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2025-06-09 22:15:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 538 [text/plain]\n",
+ "Saving to: ‘xml-files/multi-level.xml’\n",
+ "\n",
+ "\r",
+ "multi-level.xml 0%[ ] 0 --.-KB/s \r",
+ "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n",
+ "\n",
+ "2025-06-09 22:15:15 (21.2 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mkdir xml-files\n",
+ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -478,13 +543,13 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bAkMjJ1vdalE",
- "outputId": "15401bcc-3cb2-474a-d771-0efed1eaf9cd"
+ "outputId": "582dcc26-76ea-4cac-c5f6-46e009b639f9"
},
"outputs": [
{
@@ -519,13 +584,13 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VWbUgoVQrO8m",
- "outputId": "36bbf310-7ee5-474a-93f2-4d940d3c0547"
+ "outputId": "56f4f9ce-41bb-48ba-b5db-7e1bde47d8d8"
},
"outputs": [
{
@@ -558,13 +623,13 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YFzeGJJ3ICVM",
- "outputId": "01c349aa-16d2-4e0d-8a30-11399caf2ef2"
+ "outputId": "fc9bc68c-2b20-479e-8fe8-3e380877cebf"
},
"outputs": [
{
@@ -597,13 +662,13 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y_xl0ahaJ0Hy",
- "outputId": "6040b119-2eca-4c58-f51b-e20fbefeef8d"
+ "outputId": "327222b8-0c6b-4578-8fde-4f14f9835edc"
},
"outputs": [
{
@@ -636,13 +701,13 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4JnKvFe5KVDf",
- "outputId": "d91d1ee5-d4a3-48a1-b40a-d5f6bf997025"
+ "outputId": "c9252fb7-3840-4c95-d461-a56eef9adaea"
},
"outputs": [
{
@@ -675,13 +740,13 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_sldwjppKoPl",
- "outputId": "467e9085-86dd-43df-f63b-a707b920d3b3"
+ "outputId": "0619383d-abf4-43a6-f63d-ad81897f8d9e"
},
"outputs": [
{
@@ -714,13 +779,13 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GltbZAjmKwQs",
- "outputId": "c3f18b1f-06df-4233-8874-e9702c465e69"
+ "outputId": "df9ae11b-0186-4e61-d6ff-9581c597ccd1"
},
"outputs": [
{
@@ -731,9 +796,9 @@
"+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
"| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n",
"+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n",
"+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
"\n"
]
@@ -798,7 +863,9 @@
"| `infer_table_structure` | Word, Excel, PowerPoint | Whether to generate an HTML table representation from structured table content. When enabled, a full `
` element is added alongside cell-level elements, based on row and column layout. |\n",
"| `append_cells` | Excel | Whether to append all rows into a single content block instead of creating separate elements per row. |\n",
"| `cell_separator` | Excel | String used to join cell values in a row when assembling textual output |\n",
- "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |"
+ "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |\n",
+ "| `xml_keep_tags` | XML | Whether to retain original XML tag names and include them in the metadata for each extracted element |\n",
+ "| `only_leaf_nodes` | XML | If true, only the deepest elements are extracted. If false, all elements are extracted|"
]
},
{
@@ -812,13 +879,13 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gDJyUi_9R4fr",
- "outputId": "4aebe625-444d-4161-be23-512708ced1b5"
+ "outputId": "181d8e88-7a0b-4a6e-f497-7fd4add3726c"
},
"outputs": [
{
@@ -830,8 +897,8 @@
"| path| doc|\n",
"+--------------------+--------------------+\n",
"|file:/content/wor...|[{NarrativeText, ...|\n",
- "|file:/content/wor...|[{Header, An inli...|\n",
"|file:/content/wor...|[{Table, Header C...|\n",
+ "|file:/content/wor...|[{Header, An inli...|\n",
"+--------------------+--------------------+\n",
"\n"
]
@@ -843,50 +910,23 @@
]
},
{
- "cell_type": "code",
- "execution_count": 23,
+ "cell_type": "markdown",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "3vz48AHQHyON",
- "outputId": "f3ba8c4b-3bfc-453a-d8d4-f86a5fca0a1b"
+ "id": "F0lCz9OyPYYh"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Warning::Spark Session already created, some configs may not take.\n",
- "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
- "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n",
- "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n",
- "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n",
- "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
- "\n"
- ]
- }
- ],
"source": [
- "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n",
- "partition_df.show()"
+ "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 26,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "S50lqIFskNO3",
- "outputId": "e52f4cde-cfb9-4a55-d989-6e9fe40a0321"
+ "id": "qExdRJ2aPsYV",
+ "outputId": "9a033a02-4bae-4570-aaba-b81c23b8e0e1"
},
"outputs": [
{
@@ -894,38 +934,40 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|path |xls |\n",
- "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|file:/content/excel-files/PageBreakExample.xlsx|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}]|\n",
- "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "+--------------------+--------------------+--------------------+\n",
+ "| path| doc| content|\n",
+ "+--------------------+--------------------+--------------------+\n",
+ "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n",
+ "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n",
+ "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n",
+ "+--------------------+--------------------+--------------------+\n",
"\n"
]
}
],
"source": [
- "partition_df = Partition(content_type = \"application/vnd.ms-excel\").partition(\"./excel-files/PageBreakExample.xlsx\")\n",
- "partition_df.show(truncate=False)"
+ "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n",
+ "partition_df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "F0lCz9OyPYYh"
+ "id": "E3bCFJZn8TS0"
},
"source": [
- "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output"
+ "## Partitioning PDF Files"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 24,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "qExdRJ2aPsYV",
- "outputId": "0284de34-ce6a-4d1e-91bc-268521111015"
+ "id": "3vz48AHQHyON",
+ "outputId": "19369e63-f963-4422-a791-57ea5394df1a"
},
"outputs": [
{
@@ -933,19 +975,23 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+--------------------+--------------------+--------------------+\n",
- "| path| doc| content|\n",
- "+--------------------+--------------------+--------------------+\n",
- "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n",
- "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n",
- "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n",
- "+--------------------+--------------------+--------------------+\n",
+ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
+ "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n",
+ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n",
+ "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n",
+ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
"\n"
]
}
],
"source": [
- "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n",
+ "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n",
"partition_df.show()"
]
},
@@ -969,13 +1015,13 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_2J0zEmma8jm",
- "outputId": "405391bf-60bf-4632-ef0e-e84496049c71"
+ "outputId": "90f668d7-03d9-496f-dc82-a620c59f9c08"
},
"outputs": [
{
@@ -1018,13 +1064,13 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 28,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4sY2ADN8dusy",
- "outputId": "98af2c82-8a55-46ff-f631-7775431820cb"
+ "outputId": "8164237e-6835-404a-d7a7-b5ef0ef99c6d"
},
"outputs": [
{
@@ -1046,24 +1092,33 @@
"partition_df.show(truncate=False)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "uMyqJX-K7dss"
+ },
+ "source": [
+ "## Partitioning MS Office documents"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
"id": "_9dDTCrpGdoN"
},
"source": [
- "For Word documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output."
+ "For Excel documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output."
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7ICTZmLGk3Sa",
- "outputId": "5e31a551-2746-4c45-b933-56f55e4866c9"
+ "outputId": "1796055a-808c-4eff-fc86-14e29cf9b53e"
},
"outputs": [
{
@@ -1087,13 +1142,13 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YId4UG1rOVQq",
- "outputId": "7de8b4be-9936-4330-8a0f-019c3a55182a"
+ "outputId": "32827dea-d7b3-4137-abff-9e4502f8cd93"
},
"outputs": [
{
@@ -1118,38 +1173,21 @@
{
"cell_type": "markdown",
"metadata": {
- "id": "jpRmFNPNNqkf"
- },
- "source": [
- "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "id": "HwnYBQ5l7rDM"
+ "id": "E8ockED4NxLi"
},
- "outputs": [],
"source": [
- "text = (\n",
- " \"The big brown fox\\n\"\n",
- " \"was walking down the lane.\\n\"\n",
- " \"\\n\"\n",
- " \"At the end of the lane,\\n\"\n",
- " \"the fox met a bear.\"\n",
- " )"
+ "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output."
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "mutwZUFj720X",
- "outputId": "87cd31c5-2f94-4777-9ea5-b6edf8277347"
+ "id": "fPCpk7RTGRjo",
+ "outputId": "a818ecd7-8580-4098-b30f-6e46b8ef6baa"
},
"outputs": [
{
@@ -1157,61 +1195,77 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|txt |\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|path |ppt |\n",
+ "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n",
+ "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"\n"
]
}
],
"source": [
- "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n",
- "text_df.show(truncate=False)"
+ "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n",
+ "partition_df.show(truncate=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "E8ockED4NxLi"
+ "id": "qRfRSGvhN303"
},
"source": [
- "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output."
+ "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display."
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "xF8F-5CP3qWY",
- "outputId": "71b5e0cb-b22a-4774-a7b6-83c4fd67fadb"
+ "id": "twLdjGxZWiOJ",
+ "outputId": "8adcaa80-b02c-4e8f-8205-20efa8c40b4b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "fake-power-point.pptx fake-power-point-table.pptx\n"
+ "Warning::Spark Session already created, some configs may not take.\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|xls |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, | Date | Fri Jul 19 00:00:00 UTC 2024 |
| Assets | Debts |
| Bank1 | 5865.43 | Credit Card1 | 2000.0 |
| Bank2 | 10140.19 | Credit Card2 | 1500.0 |
| Bank3 | 1200.0 | Credit Card3 | 348.0 |
| Bank4 | 1438.27 | Total | SUM(F3:F5) |
| Total | SUM(B3:B6) |
, {SheetName -> Sheet1}}]|\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
]
}
],
"source": [
- "!ls ppt-files"
+ "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n",
+ "partition_df.select(\"xls\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8txswwbjN8Mg"
+ },
+ "source": [
+ "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually."
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 36,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "fPCpk7RTGRjo",
- "outputId": "74144c26-5060-4c99-f291-a097b838e774"
+ "id": "PQ4MpGw6xCko",
+ "outputId": "aaf807a7-27b9-40cc-8a75-58be077f8403"
},
"outputs": [
{
@@ -1219,38 +1273,64 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|path |ppt |\n",
- "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n",
- "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|xls |\n",
+ "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n",
+ "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"\n"
]
}
],
"source": [
- "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n",
- "partition_df.show(truncate=False)"
+ "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n",
+ "partition_df.select(\"xls\").show(truncate=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "qRfRSGvhN303"
+ "id": "_GyL6D4N75i-"
},
"source": [
- "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display."
+ "## Partitioning Text Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jpRmFNPNNqkf"
+ },
+ "source": [
+ "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "id": "HwnYBQ5l7rDM"
+ },
+ "outputs": [],
+ "source": [
+ "text = (\n",
+ " \"The big brown fox\\n\"\n",
+ " \"was walking down the lane.\\n\"\n",
+ " \"\\n\"\n",
+ " \"At the end of the lane,\\n\"\n",
+ " \"the fox met a bear.\"\n",
+ " )"
]
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 32,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "twLdjGxZWiOJ",
- "outputId": "ec340358-7279-4247-b27c-5a0a25f38ee6"
+ "id": "mutwZUFj720X",
+ "outputId": "8b4f474d-2f3f-4e81-cecf-5de420561124"
},
"outputs": [
{
@@ -1258,38 +1338,47 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|xls |\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, | Date | Fri Jul 19 00:00:00 UTC 2024 |
| Assets | Debts |
| Bank1 | 5865.43 | Credit Card1 | 2000.0 |
| Bank2 | 10140.19 | Credit Card2 | 1500.0 |
| Bank3 | 1200.0 | Credit Card3 | 348.0 |
| Bank4 | 1438.27 | Total | SUM(F3:F5) |
| Total | SUM(B3:B6) |
, {SheetName -> Sheet1}}]|\n",
- "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|txt |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"\n"
]
}
],
"source": [
- "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n",
- "partition_df.select(\"xls\").show(truncate=False)"
+ "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n",
+ "text_df.show(truncate=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "8txswwbjN8Mg"
+ "id": "epCp5DnQ8E7o"
},
"source": [
- "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually."
+ "## Partitioning XML Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DWX0nkc4tM7J"
+ },
+ "source": [
 +    "In Spark NLP 6.0.3, we added support for XML files."
]
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 45,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
- "id": "PQ4MpGw6xCko",
- "outputId": "808783d2-f15b-45ae-90fb-a623243898f3"
+ "id": "AViMSzKQtP-o",
+ "outputId": "147a1ef9-3f14-4832-a050-e60c8ac9544b"
},
"outputs": [
{
@@ -1297,18 +1386,18 @@
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
- "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|xls |\n",
- "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
- "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n",
- "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|xml |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]|\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"\n"
]
}
],
"source": [
- "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n",
- "partition_df.select(\"xls\").show(truncate=False)"
+ "partition_df = Partition(xml_keep_tags = True).partition(\"./xml-files/multi-level.xml\")\n",
+ "partition_df.select(\"xml\").show(truncate=False)"
]
}
],
diff --git a/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb
new file mode 100644
index 00000000000000..38b43aed37b95e
--- /dev/null
+++ b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb
@@ -0,0 +1,339 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tzcU5p2gdak9"
+ },
+ "source": [
+ "# Introducing XML reader in SparkNLP\n",
+ "This notebook showcases the newly added `sparknlp.read().xml()` method in Spark NLP that parses XML content from both local files and real-time URLs into a Spark DataFrame.\n",
+ "\n",
+ "**Key Features:**\n",
+ "- Ability to parse XML from local directories and URLs.\n",
+ "- Versatile support for varied data ingestion scenarios."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RFOFhaEedalB"
+ },
+ "source": [
+ "## Setup and Initialization\n",
+ "Let's keep in mind a few things before we start 😊\n",
+ "\n",
+ "Support for reading xml files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Y3hWfT5q-npM"
+ },
+ "source": [
+ "- Let's install and setup Spark NLP in Google Colab\n",
+ "- This part is pretty easy via our simple script"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "u3ORYVyb-pRI"
+ },
+ "outputs": [],
+ "source": [
+ "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oIbFQyEo-tat"
+ },
+ "source": [
+ "For local files example we will download a couple of XML files from Spark NLP Github repo:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ya8qZe00dalC",
+ "outputId": "7d597910-9826-4472-9fdc-5b8ac398e6cf"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 538 [text/plain]\n",
+ "Saving to: ‘xml-files/multi-level.xml’\n",
+ "\n",
+ "\r",
+ "multi-level.xml 0%[ ] 0 --.-KB/s \r",
+ "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n",
+ "\n",
+ "2025-06-09 21:43:40 (34.0 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n",
+ "\n",
+ "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/test.xml\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 382 [text/plain]\n",
+ "Saving to: ‘xml-files/test.xml’\n",
+ "\n",
+ "test.xml 100%[===================>] 382 --.-KB/s in 0s \n",
+ "\n",
+ "2025-06-09 21:43:40 (7.58 MB/s) - ‘xml-files/test.xml’ saved [382/382]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mkdir xml-files\n",
+ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files\n",
+ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/test.xml -P xml-files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EoFI66NAdalE"
+ },
+ "source": [
+ "## Parsing XML from Local Files\n",
+ "Use the `xml()` method to parse XML content from local directories."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bAkMjJ1vdalE",
+ "outputId": "0bba10be-75de-48de-9a06-d6197d35218f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning::Spark Session already created, some configs may not take.\n",
+ "+--------------------+--------------------+\n",
+ "| path| xml|\n",
+ "+--------------------+--------------------+\n",
+ "|file:/content/xml...|[{Title, Harry Po...|\n",
+ "|file:/content/xml...|[{Title, The Alch...|\n",
+ "+--------------------+--------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sparknlp\n",
+ "xml_df = sparknlp.read().xml(\"./xml-files\")\n",
+ "\n",
+ "xml_df.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "oBj0cHPXSD1m",
+ "outputId": "00951736-40d4-4f9e-fe25-cc5117405269"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- path: string (nullable = true)\n",
+ " |-- xml: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- elementType: string (nullable = true)\n",
+ " | | |-- content: string (nullable = true)\n",
+ " | | |-- metadata: map (nullable = true)\n",
+ " | | | |-- key: string\n",
+ " | | | |-- value: string (valueContainsNull = true)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "xml_df.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FrVKxdySz8pR"
+ },
+ "source": [
+ "### Configuration Parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CC_klLwhV8um"
+ },
+ "source": [
+ "`xmlKeepTags`: When true, includes the tag name of each XML element in the metadata under the key `tag`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "aNfN0fQC0Vzz",
+ "outputId": "ebdb1393-b91c-4c60-d7e7-b7ecc6465171"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning::Spark Session already created, some configs may not take.\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|xml |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> title}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> author}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> year}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> price}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> title}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> author}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> year}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> price}}]|\n",
+ "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}] |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "params = {\"xmlKeepTags\": \"true\"}\n",
+ "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+ "xml_df.select(\"xml\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "t06KtTItWQ4R"
+ },
+ "source": [
+ "`onlyLeafNodes`: When true, includes only leaf elements (i.e., elements with no child elements) in the output. When false, all elements (including containers) are included."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jTM1btqNntUL",
+ "outputId": "f86a0b28-73ac-46d1-8d26-f920e2d935cd"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning::Spark Session already created, some configs may not take.\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|xml |\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{UncategorizedText, , {elementId -> 931f811d0c9b488a01a7875f80992a62}}, {UncategorizedText, , {elementId -> 1f610d9429ab17d0d7ab49ee3069b4fc, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, , {elementId -> 249aff1b3e9835325b45e51cdfc4ad46, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}] |\n",
+ "|[{UncategorizedText, , {elementId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> d7416d9cac3ba3af57ef6b6b71d7841b, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> b79ae4ca74ec00f63a00b6cd66acc1e0, parentId -> d7416d9cac3ba3af57ef6b6b71d7841b}}, {UncategorizedText, , {elementId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, parentId -> b79ae4ca74ec00f63a00b6cd66acc1e0}}, {Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, , {elementId -> 9ebecf846e7dea80c563ebcb2f7d4a9a, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> 80472cd1880f453b8adecc61870748ba, parentId -> 9ebecf846e7dea80c563ebcb2f7d4a9a}}, {UncategorizedText, , {elementId -> 9708b29025b53d9f54c723ee005b647b, parentId -> 80472cd1880f453b8adecc61870748ba}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]|\n",
+ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "params = {\"onlyLeafNodes\": \"false\"}\n",
+ "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+ "xml_df.select(\"xml\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "O8DePUq8nkYm"
+ },
+ "source": [
+ "You can access the raw content of the file using the `storeContent` parameter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "E0S5aRb5WFLf",
+ "outputId": "5e624eeb-fbc1-47a4-ff21-aef410a10bb2"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning::Spark Session already created, some configs may not take.\n",
+ "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|path |content |xml |\n",
+ "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|file:/content/xml-files/test.xml |\\n \\n Harry Potter\\n J K. Rowling\\n 2005\\n 29.99\\n \\n \\n Learning XML\\n Erik T. Ray\\n 2003\\n 39.95\\n \\n |[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]|\n",
+ "|file:/content/xml-files/multi-level.xml|\\n \\n \\n \\n The Alchemist\\n Paulo Coelho\\n 1988\\n \\n \\n \\n \\n \\n \\n A Brief History of Time\\n Stephen Hawking\\n 1988\\n \\n \\n \\n\\n|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}] |\n",
+ "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "params = {\"storeContent\": \"true\"}\n",
+ "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+ "xml_df.show(truncate=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py
index dfd865116f3821..86bf5781053050 100644
--- a/python/sparknlp/reader/sparknlp_reader.py
+++ b/python/sparknlp/reader/sparknlp_reader.py
@@ -322,4 +322,49 @@ def txt(self, docPath):
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.txt(docPath)
+ return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Raises
+        ------
+        TypeError
+            If ``docPath`` is not a string.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("/home/user/xml-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("/home/user/xml-directory")
+        >>> xml_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |xml                                                        |
+        +-----------------------------------------------------------+
+        |[{Title, John Smith, {elementId -> ..., tag -> title}}]    |
+        +-----------------------------------------------------------+
+
+        >>> xml_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- xml: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.xml(docPath)
+        return self.getDataFrame(self.spark, jdf)
\ No newline at end of file
diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py
index 68ea10b36476bf..c2baa14fec213d 100644
--- a/python/test/sparknlp_test.py
+++ b/python/test/sparknlp_test.py
@@ -125,4 +125,18 @@ def runTest(self):
txt_df = sparknlp.read().txt(self.txt_file)
txt_df.show()
- self.assertTrue(txt_df.select("txt").count() > 0)
\ No newline at end of file
+ self.assertTrue(txt_df.select("txt").count() > 0)
+
+
+@pytest.mark.fast
+class SparkNLPTestXMLFilesSpec(unittest.TestCase):
+
+ def setUp(self):
+ self.data = SparkContextForTest.data
+ self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml"
+
+ def runTest(self):
+ xml_df = sparknlp.read().xml(self.xml_files)
+ xml_df.show()
+
+ self.assertTrue(xml_df.select("xml").count() > 0)
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
new file mode 100644
index 00000000000000..4993bc65a8cd8b
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.Param
+
+/** Reader parameters specific to XML input, mixed into the Partition transformer. */
+trait HasXmlReaderProperties extends ParamsAndFeaturesWritable {
+
+  /** Whether to include XML tag names as metadata in the output (Default: `false`). */
+  val xmlKeepTags = new Param[Boolean](
+    this,
+    "xmlKeepTags",
+    "Whether to include XML tag names as metadata in the output.")
+
+  /** Sets whether to include XML tag names as metadata in the output. */
+  def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value)
+
+  /** If true, only XML leaf nodes (elements without child elements) are emitted (Default: `true`). */
+  val onlyLeafNodes = new Param[Boolean](
+    this,
+    "onlyLeafNodes",
+    "If true, only processes XML leaf nodes (no nested children).")
+
+  /** Sets whether to process only XML leaf nodes. */
+  def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value)
+
+  setDefault(xmlKeepTags -> false, onlyLeafNodes -> true)
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
index a339fe9b258ee4..2e6f69b8c5b4c4 100644
--- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
@@ -188,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
"application/vnd.openxmlformats-officedocument.presentationml.presentation" =>
sparkNLPReader.ppt
case "application/pdf" => sparkNLPReader.pdf
+ case "application/xml" => sparkNLPReader.xml
case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
}
}
@@ -199,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "text/plain" => sparkNLPReader.txtToHTMLElement
case "text/html" => sparkNLPReader.htmlToHTMLElement
case "url" => sparkNLPReader.urlToHTMLElement
+ case "application/xml" => sparkNLPReader.xmlToHTMLElement
case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
}
}
@@ -234,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "xls" | "xlsx" => sparkNLPReader.xls
case "ppt" | "pptx" => sparkNLPReader.ppt
case "pdf" => sparkNLPReader.pdf
+ case "xml" => sparkNLPReader.xml
case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
}
}
diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
index 73d461c91aaafc..281af53931d72c 100644
--- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
@@ -86,6 +86,7 @@ class PartitionTransformer(override val uid: String)
with HasPowerPointProperties
with HasTextReaderProperties
with HasPdfProperties
+ with HasXmlReaderProperties
with HasChunkerProperties {
def this() = this(Identifiable.randomUID("PartitionTransformer"))
@@ -157,7 +158,9 @@ class PartitionTransformer(override val uid: String)
"newAfterNChars" -> $(newAfterNChars).toString,
"overlap" -> $(overlap).toString,
"combineTextUnderNChars" -> $(combineTextUnderNChars).toString,
- "overlapAll" -> $(overlapAll).toString)
+ "overlapAll" -> $(overlapAll).toString,
+ "xmlKeepTags" -> $(xmlKeepTags).toString,
+ "onlyLeafNodes" -> $(onlyLeafNodes).toString)
val partitionInstance = new Partition(params.asJava)
val inputColum = if (get(inputCols).isDefined) {
diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
index a1637116cb7905..216492876cc718 100644
--- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
@@ -296,7 +296,6 @@ class SparkNLPReader(
* |-- width_dimension: integer (nullable = true)
* |-- content: binary (nullable = true)
* |-- exception: string (nullable = true)
- * |-- pagenum: integer (nullable = true)
* }}}
*
* @param params
@@ -642,4 +641,69 @@ class SparkNLPReader(
default = BLOCK_SPLIT_PATTERN)
}
+  /** Instantiates class to read XML files.
+    *
+    * xmlPath: this is a path to a directory of XML files or a path to an XML file. E.g.,
+    * "path/xml/files"
+    *
+    * ==Example==
+    * {{{
+    * val xmlPath = "/home/user/xml-directory"
+    * val sparkNLPReader = new SparkNLPReader()
+    * val xmlDf = sparkNLPReader.xml(xmlPath)
+    * }}}
+    *
+    * ==Example 2==
+    * You can use SparkNLP for one line of code
+    * {{{
+    * val xmlDf = SparkNLP.read.xml(xmlPath)
+    * }}}
+    *
+    * {{{
+    * xmlDf.select("xml").show(false)
+    * +------------------------------------------------------------------------------------------------------------------------+
+    * |xml                                                                                                                     |
+    * +------------------------------------------------------------------------------------------------------------------------+
+    * |[{Title, John Smith, {elementId -> ..., tag -> title}}, {UncategorizedText, Some content..., {elementId -> ...}}]       |
+    * +------------------------------------------------------------------------------------------------------------------------+
+    *
+    * xmlDf.printSchema()
+    * root
+    *  |-- path: string (nullable = true)
+    *  |-- xml: array (nullable = true)
+    *  |    |-- element: struct (containsNull = true)
+    *  |    |    |-- elementType: string (nullable = true)
+    *  |    |    |-- content: string (nullable = true)
+    *  |    |    |-- metadata: map (nullable = true)
+    *  |    |    |    |-- key: string
+    *  |    |    |    |-- value: string (valueContainsNull = true)
+    * }}}
+    *
+    * @param xmlPath
+    *   Path to the XML file or directory
+    * @return
+    *   A DataFrame with parsed XML as structured elements
+    */
+  def xml(xmlPath: String): DataFrame = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.read(xmlPath)
+  }
+
+  /** Parses a raw XML string into structured elements (used by Partition for
+    * in-memory content rather than file paths).
+    */
+  def xmlToHTMLElement(xml: String): Seq[HTMLElement] = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.parseXml(xml)
+  }
+
+  // Reads "xmlKeepTags" / "xml_keep_tags" from the params map; defaults to false.
+  private def getXmlKeepTags: Boolean = {
+    getDefaultBoolean(params.asScala.toMap, Seq("xmlKeepTags", "xml_keep_tags"), default = false)
+  }
+
+  // Reads "onlyLeafNodes" / "only_leaf_nodes" from the params map; defaults to true.
+  private def getOnlyLeafNodes: Boolean = {
+    getDefaultBoolean(
+      params.asScala.toMap,
+      Seq("onlyLeafNodes", "only_leaf_nodes"),
+      default = true)
+  }
+
}
diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
new file mode 100644
index 00000000000000..fc777458dafb83
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, udf}
+
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.xml.{Elem, Node, XML}
+
+/** Class to parse and read XML files.
+ *
+ * @param storeContent
+ * Whether to include the raw XML content in the resulting DataFrame as a separate 'content'
+ * column. By default, this is false.
+ *
+ * @param xmlKeepTags
+ * Whether to retain original XML tag names and include them in the metadata for each extracted
+ * element. Useful for preserving structure. Default is false.
+ *
+ * @param onlyLeafNodes
+ * If true, only the deepest elements (those without child elements) are extracted. If false,
+ * all elements are extracted. Default is true.
+ *
+ * ==Input Format==
+ * Input must be a valid path to an XML file or a directory containing XML files.
+ *
+ * ==Example==
+ * {{{
+ * val xmlPath = "./data/sample.xml"
+ * val xmlReader = new XMLReader()
+ * val xmlDf = xmlReader.read(xmlPath)
+ * }}}
+ *
+ * {{{
+ * xmlDf.show(truncate = false)
+ * +----------------------+--------------------------------------------------+
+ * |path |xml |
+ * +----------------------+--------------------------------------------------+
+ * |file:/data/sample.xml |[{Title, My Book, {tag -> title}}, ...] |
+ * +----------------------+--------------------------------------------------+
+ *
+ * xmlDf.printSchema()
+ * root
+ * |-- path: string (nullable = true)
+ * |-- xml: array (nullable = true)
+ * | |-- element: struct (containsNull = true)
+ * | | |-- elementType: string (nullable = true)
+ * | | |-- content: string (nullable = true)
+ * | | |-- metadata: map (nullable = true)
+ * | | | |-- key: string
+ * | | | |-- value: string (valueContainsNull = true)
+ * }}}
+ *
+ * For more examples refer to:
+ * [[https://github.com/JohnSnowLabs/spark-nlp/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb notebook]]
+ */
+class XMLReader(
+    storeContent: Boolean = false,
+    xmlKeepTags: Boolean = false,
+    onlyLeafNodes: Boolean = true)
+    extends Serializable {
+
+  private lazy val spark = ResourceHelper.spark
+
+  // Name of the DataFrame column that receives the parsed elements.
+  private var outputColumn = "xml"
+
+  /** Overrides the default output column name ("xml").
+    *
+    * @param value non-empty column name
+    */
+  def setOutputColumn(value: String): this.type = {
+    require(value.nonEmpty, "Output column name cannot be empty.")
+    outputColumn = value
+    this
+  }
+
+  /** Reads an XML file or a directory of XML files into a DataFrame with a
+    * `path` column, an optional `content` column (when storeContent is true),
+    * and the parsed-elements column.
+    *
+    * @throws IllegalArgumentException if inputSource is not a valid file or directory
+    */
+  def read(inputSource: String): DataFrame = {
+    if (validFile(inputSource)) {
+      val xmlDf = datasetWithTextFile(spark, inputSource)
+        .withColumn(outputColumn, parseXmlUDF(col("content")))
+      if (storeContent) xmlDf.select("path", "content", outputColumn)
+      else xmlDf.select("path", outputColumn)
+    } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource")
+  }
+
+  // UDF wrapper so parseXml runs per row over the raw text column.
+  private val parseXmlUDF = udf((xml: String) => {
+    parseXml(xml)
+  })
+
+  /** Parses a raw XML string into a flat list of HTMLElements.
+    *
+    * NOTE(review): XML.loadString uses the platform's default SAX parser; for
+    * untrusted input, consider disabling DTDs/external entities (XXE) — TODO confirm.
+    */
+  def parseXml(xmlString: String): List[HTMLElement] = {
+    val xml = XML.loadString(xmlString)
+    val elements = ListBuffer[HTMLElement]()
+
+    def traverse(node: Node, parentId: Option[String]): Unit = {
+      node match {
+        case elem: Elem =>
+          val tagName = elem.label.toLowerCase
+          val textContent = elem.text.trim
+          // Deterministic id derived from tag + text; also passed to children
+          // as their parentId metadata below.
+          val elementId = hash(tagName + textContent)
+
+          // A leaf has no child *elements* (text nodes are still allowed).
+          val isLeaf = !elem.child.exists(_.isInstanceOf[Elem])
+
+          if (!onlyLeafNodes || isLeaf) {
+            val elementType = tagName match {
+              case "title" | "author" => ElementType.TITLE
+              case _ => ElementType.UNCATEGORIZED_TEXT
+            }
+
+            val metadata = mutable.Map[String, String]("elementId" -> elementId)
+            if (xmlKeepTags) metadata += ("tag" -> tagName)
+            parentId.foreach(id => metadata += ("parentId" -> id))
+
+            // Only leaves carry text; non-leaf nodes are emitted as structural
+            // markers with empty content.
+            val content = if (isLeaf) textContent else ""
+            elements += HTMLElement(elementType, content, metadata)
+          }
+
+          // Recurse into children regardless of the onlyLeafNodes setting.
+          elem.child.foreach(traverse(_, Some(elementId)))
+
+        case _ => // Ignore non-element nodes (text, comments, processing instructions)
+      }
+    }
+
+    traverse(xml, None)
+    elements.toList
+  }
+
+  /** MD5 hex digest used to build stable element ids. */
+  def hash(s: String): String = {
+    java.security.MessageDigest
+      .getInstance("MD5")
+      .digest(s.getBytes)
+      .map("%02x".format(_))
+      .mkString
+  }
+
+}
diff --git a/src/test/resources/reader/xml/multi-level.xml b/src/test/resources/reader/xml/multi-level.xml
new file mode 100644
index 00000000000000..e14e5ad684be30
--- /dev/null
+++ b/src/test/resources/reader/xml/multi-level.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<library>
+    <section name="Fiction">
+        <book>
+            <title>The Alchemist</title>
+            <author>Paulo Coelho</author>
+            <year>1988</year>
+        </book>
+    </section>
+
+    <section name="Science">
+        <shelf location="A1">
+            <book>
+                <title>A Brief History of Time</title>
+                <author>Stephen Hawking</author>
+                <year>1988</year>
+            </book>
+        </shelf>
+    </section>
+</library>
diff --git a/src/test/resources/reader/xml/test.xml b/src/test/resources/reader/xml/test.xml
new file mode 100644
index 00000000000000..44bdab910b4c96
--- /dev/null
+++ b/src/test/resources/reader/xml/test.xml
@@ -0,0 +1,14 @@
+<bookstore>
+    <book category="children">
+        <title lang="en">Harry Potter</title>
+        <author>J K. Rowling</author>
+        <year>2005</year>
+        <price>29.99</price>
+    </book>
+    <book category="web">
+        <title lang="en">Learning XML</title>
+        <author>Erik T. Ray</author>
+        <year>2003</year>
+        <price>39.95</price>
+    </book>
+</bookstore>
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
index 9937b95f59e512..05c5916c843424 100644
--- a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
@@ -32,6 +32,7 @@ class PartitionTest extends AnyFlatSpec {
val emailDirectory = "src/test/resources/reader/email"
val htmlDirectory = "src/test/resources/reader/html"
val pdfDirectory = "src/test/resources/reader/pdf"
+ val xmlDirectory = "src/test/resources/reader/xml"
"Partition" should "work with text content_type" taggedAs FastTest in {
val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
@@ -181,4 +182,11 @@ class PartitionTest extends AnyFlatSpec {
assert(elements == expectedElements)
}
+  it should "work with XML content_type" taggedAs FastTest in {
+    // Renamed from pdfDf: this test partitions XML, not PDF.
+    val xmlDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory)
+    xmlDf.show()
+
+    assert(!xmlDf.select(col("xml")).isEmpty)
+  }
+
}
diff --git a/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
new file mode 100644
index 00000000000000..a75537803e61de
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.tags.FastTest
+import org.apache.spark.sql.functions.{array_contains, col, explode, map_keys}
+import org.scalatest.flatspec.AnyFlatSpec
+
+class XMLReaderTest extends AnyFlatSpec {
+
+  val xmlFilesDirectory = "./src/test/resources/reader/xml/"
+
+  // Renamed locals: `val XMLReader` shadowed the class name, which is confusing
+  // and violates Scala naming conventions for vals.
+  "XMLReader" should "read xml as dataframe" taggedAs FastTest in {
+    val xmlReader = new XMLReader()
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/test.xml")
+    xmlDF.show(truncate = false)
+
+    assert(!xmlDF.select(col("xml").getItem(0)).isEmpty)
+    assert(!xmlDF.columns.contains("content"))
+  }
+
+  it should "include tags in the output" taggedAs FastTest in {
+    val xmlReader = new XMLReader(xmlKeepTags = true)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+    val tagsDf = explodedDf.filter(col("xml_exploded.metadata")("tag") =!= "")
+
+    assert(tagsDf.count() > 0)
+  }
+
+  it should "output all nodes" taggedAs FastTest in {
+    val xmlReader = new XMLReader(onlyLeafNodes = false)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+
+    // Rows without a parentId are root-level elements, which only appear when
+    // non-leaf nodes are emitted.
+    val rootLevelDf = explodedDf
+      .filter(!array_contains(map_keys(col("xml_exploded.metadata")), "parentId"))
+
+    assert(rootLevelDf.count() > 0)
+  }
+
+}