diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
index c5eecc20945f3f..659bdbc309f127 100644
--- a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
+++ b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb
@@ -20,8 +20,30 @@
     "## Setup and Initialization\n",
     "Let's keep in mind a few things before we start 😊\n",
     "\n",
-    "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release.\n",
-    "\n",
+    "Support for **Partitioning** files was introduced in Spark NLP 6.0.1. Please make sure you have upgraded to the latest Spark NLP release."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Let's install and set up Spark NLP in Google Colab\n",
+    "- This part is pretty easy via our simple script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "For local files example we will download different files from Spark NLP Github repo:"
    ]
   },
@@ -42,34 +64,34 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "bo7s-jZVrE7W",
-    "outputId": "e7234d36-765e-4a29-f922-02ceab1626dd"
+    "outputId": "b0e91448-3b2c-4dab-84c7-5e7d8bad0be5"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:05--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n",
+      "--2025-06-09 22:10:23--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 2456707 (2.3M) [text/plain]\n",
       "Saving to: ‘html-files/example-10k.html’\n",
       "\n",
-      "example-10k.html    100%[===================>]   2.34M  --.-KB/s    in 0.05s   \n",
+      "example-10k.html    100%[===================>]   2.34M  --.-KB/s    in 0.04s   \n",
       "\n",
-      "2025-05-26 23:11:06 (45.1 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n",
+      "2025-06-09 22:10:23 (52.9 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n",
       "\n",
-      "--2025-05-26 23:11:06--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "--2025-06-09 22:10:23--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 665 [text/plain]\n",
       "Saving to: ‘html-files/fake-html.html’\n",
       "\n",
       "fake-html.html      100%[===================>]     665  --.-KB/s    in 0s      \n",
       "\n",
-      "2025-05-26 23:11:06 (30.2 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n",
+      "2025-06-09 22:10:24 (18.3 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n",
       "\n"
      ]
     }
@@ -97,38 +119,36 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "ya8qZe00dalC",
-    "outputId": "ba520f44-c4b9-45b1-f03c-6a8e3a33320b"
+    "outputId": "9b4fbf52-9ecc-454b-bef1-0ce31dadb7c7"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:06--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
+      "--2025-06-09 22:10:24--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 15629 (15K) [application/octet-stream]\n",
       "Saving to: ‘pdf-files/image_3_pages.pdf’\n",
       "\n",
-      "\r",
-      "image_3_pages.pdf     0%[                    ]       0  --.-KB/s               \r",
       "image_3_pages.pdf   100%[===================>]  15.26K  --.-KB/s    in 0.001s  \n",
       "\n",
-      "2025-05-26 23:11:06 (25.5 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n",
+      "2025-06-09 22:10:24 (24.3 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n",
       "\n",
-      "--2025-05-26 23:11:06--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n",
+      "--2025-06-09 22:10:24--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 25803 (25K) [application/octet-stream]\n",
       "Saving to: ‘pdf-files/pdf-title.pdf’\n",
       "\n",
-      "pdf-title.pdf       100%[===================>]  25.20K  --.-KB/s    in 0s      \n",
+      "pdf-title.pdf       100%[===================>]  25.20K  --.-KB/s    in 0.001s  \n",
       "\n",
-      "2025-05-26 23:11:06 (58.5 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n",
+      "2025-06-09 22:10:24 (21.2 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n",
       "\n",
-      "--2025-05-26 23:11:07--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n",
+      "--2025-06-09 22:10:24--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
@@ -137,7 +157,7 @@
       "\n",
       "text_3_pages.pdf    100%[===================>]   9.26K  --.-KB/s    in 0s      \n",
       "\n",
-      "2025-05-26 23:11:07 (79.2 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n",
+      "2025-06-09 22:10:24 (73.3 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n",
       "\n"
      ]
     }
@@ -166,47 +186,45 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "zLLEUl3KpYZ6",
-    "outputId": "4346e6e1-18ec-47a8-92c0-c8bc588f3441"
+    "outputId": "407e9405-6cc9-4724-f576-f52c503cb52d"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:07--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n",
+      "--2025-06-09 22:10:25--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 95087 (93K) [application/octet-stream]\n",
       "Saving to: ‘word-files/contains-pictures.docx’\n",
       "\n",
-      "\r",
-      "contains-pictures.d   0%[                    ]       0  --.-KB/s               \r",
-      "contains-pictures.d 100%[===================>]  92.86K  --.-KB/s    in 0.01s   \n",
+      "contains-pictures.d 100%[===================>]  92.86K  --.-KB/s    in 0.02s   \n",
       "\n",
-      "2025-05-26 23:11:07 (6.85 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n",
+      "2025-06-09 22:10:25 (4.74 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n",
       "\n",
-      "--2025-05-26 23:11:07--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "--2025-06-09 22:10:25--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 12392 (12K) [application/octet-stream]\n",
       "Saving to: ‘word-files/fake_table.docx’\n",
       "\n",
       "fake_table.docx     100%[===================>]  12.10K  --.-KB/s    in 0.001s  \n",
       "\n",
-      "2025-05-26 23:11:07 (17.7 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n",
+      "2025-06-09 22:10:25 (18.9 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n",
       "\n",
-      "--2025-05-26 23:11:07--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "--2025-06-09 22:10:25--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 14584 (14K) [application/octet-stream]\n",
       "Saving to: ‘word-files/page-breaks.docx’\n",
       "\n",
       "page-breaks.docx    100%[===================>]  14.24K  --.-KB/s    in 0.001s  \n",
       "\n",
-      "2025-05-26 23:11:08 (22.4 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n",
+      "2025-06-09 22:10:25 (21.5 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n",
       "\n"
      ]
     }
@@ -235,48 +253,58 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "G3-BCYP6qQ4x",
-    "outputId": "38489a6e-588d-4a1b-e319-0c7f66559ca0"
+    "outputId": "95c5a31d-eed9-47a1-bb55-0868daec7da7"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:08--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "--2025-06-09 22:10:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 12541 (12K) [application/octet-stream]\n",
       "Saving to: ‘excel-files/vodafone.xlsx’\n",
       "\n",
       "\r",
       "vodafone.xlsx         0%[                    ]       0  --.-KB/s               \r",
-      "vodafone.xlsx       100%[===================>]  12.25K  --.-KB/s    in 0.001s  \n",
+      "vodafone.xlsx       100%[===================>]  12.25K  --.-KB/s    in 0s      \n",
       "\n",
-      "2025-05-26 23:11:08 (22.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n",
+      "2025-06-09 22:10:26 (30.4 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n",
       "\n",
-      "--2025-05-26 23:11:08--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n",
+      "--2025-06-09 22:10:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 38442 (38K) [application/octet-stream]\n",
       "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n",
       "\n",
-      "2023-half-year-anal 100%[===================>]  37.54K  --.-KB/s    in 0.007s  \n",
+      "2023-half-year-anal 100%[===================>]  37.54K  --.-KB/s    in 0.01s   \n",
       "\n",
-      "2025-05-26 23:11:08 (5.37 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n",
+      "2025-06-09 22:10:26 (3.43 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n",
       "\n",
-      "--2025-05-26 23:11:08--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n",
+      "--2025-06-09 22:10:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
-      "HTTP request sent, awaiting response... 404 Not Found\n",
-      "2025-05-26 23:11:09 ERROR 404: Not Found.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 10676 (10K) [application/octet-stream]\n",
+      "Saving to: ‘excel-files/page-break-example.xlsx’\n",
+      "\n",
+      "page-break-example. 100%[===================>]  10.43K  --.-KB/s    in 0s      \n",
       "\n",
-      "--2025-05-26 23:11:09--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n",
+      "2025-06-09 22:10:26 (79.4 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n",
+      "\n",
+      "--2025-06-09 22:10:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
-      "HTTP request sent, awaiting response... 404 Not Found\n",
-      "2025-05-26 23:11:09 ERROR 404: Not Found.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 9210 (9.0K) [application/octet-stream]\n",
+      "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n",
+      "\n",
+      "xlsx-subtable-cases 100%[===================>]   8.99K  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-06-09 22:10:26 (65.5 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n",
       "\n"
      ]
     }
@@ -289,17 +317,6 @@
     "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "id": "LcSYn6q7jW9-"
-   },
-   "outputs": [],
-   "source": [
-    "!cp drive/MyDrive/JSL/PageBreakExample.xlsx ./excel-files"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -317,42 +334,45 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "1jDRFmcHqpxn",
-    "outputId": "4d59c445-3764-41a8-c91b-9231d401eac6"
+    "outputId": "cd7e3c96-bb5f-49ab-f466-56ec6be20f75"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:09--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "--2025-06-09 22:10:27--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 38412 (38K) [application/octet-stream]\n",
       "Saving to: ‘ppt-files/fake-power-point.pptx’\n",
       "\n",
-      "\r",
-      "fake-power-point.pp   0%[                    ]       0  --.-KB/s               \r",
-      "fake-power-point.pp 100%[===================>]  37.51K  --.-KB/s    in 0.007s  \n",
+      "fake-power-point.pp 100%[===================>]  37.51K  --.-KB/s    in 0.01s   \n",
       "\n",
-      "2025-05-26 23:11:10 (5.29 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n",
+      "2025-06-09 22:10:27 (3.41 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n",
       "\n",
-      "--2025-05-26 23:11:10--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n",
+      "--2025-06-09 22:10:27--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 39894 (39K) [application/octet-stream]\n",
       "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n",
       "\n",
-      "fake-power-point-ta 100%[===================>]  38.96K  --.-KB/s    in 0.006s  \n",
+      "fake-power-point-ta 100%[===================>]  38.96K  --.-KB/s    in 0.008s  \n",
       "\n",
-      "2025-05-26 23:11:10 (6.73 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n",
+      "2025-06-09 22:10:28 (4.93 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n",
       "\n",
-      "--2025-05-26 23:11:10--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n",
+      "--2025-06-09 22:10:28--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
-      "HTTP request sent, awaiting response... 404 Not Found\n",
-      "2025-05-26 23:11:10 ERROR 404: Not Found.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 39414 (38K) [application/octet-stream]\n",
+      "Saving to: ‘ppt-files/speaker-notes.pptx’\n",
+      "\n",
+      "speaker-notes.pptx  100%[===================>]  38.49K  --.-KB/s    in 0.008s  \n",
+      "\n",
+      "2025-06-09 22:10:28 (4.76 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n",
       "\n"
      ]
     }
@@ -381,14 +401,14 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "yYMVpVQurk7G",
-    "outputId": "cedb0e39-f137-4759-a158-0b84ed31b282"
+    "outputId": "293a864a-2980-4502-c6dc-a1d3cee815ee"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:10--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n",
+      "--2025-06-09 22:10:28--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
@@ -399,18 +419,18 @@
       "          email-tex   0%[                    ]       0  --.-KB/s               \r",
       "email-text-attachme 100%[===================>]   3.10K  --.-KB/s    in 0s      \n",
       "\n",
-      "2025-05-26 23:11:11 (49.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
+      "2025-06-09 22:10:28 (21.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
       "\n",
-      "--2025-05-26 23:11:11--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+      "--2025-06-09 22:10:28--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 1324361 (1.3M) [text/plain]\n",
       "Saving to: ‘email-files/test-several-attachments.eml’\n",
       "\n",
       "test-several-attach 100%[===================>]   1.26M  --.-KB/s    in 0.04s   \n",
       "\n",
-      "2025-05-26 23:11:11 (32.0 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
+      "2025-06-09 22:10:29 (30.2 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
       "\n"
      ]
     }
@@ -438,14 +458,14 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "AV-krG6Ps8pq",
-    "outputId": "c407a77f-11d5-4a3c-85e0-4abffa48bd12"
+    "outputId": "bd7317e0-97d3-4f30-a800-6ffa8148f266"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2025-05-26 23:11:11--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n",
+      "--2025-06-09 22:10:29--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n",
       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
@@ -456,7 +476,7 @@
       "simple-text.txt       0%[                    ]       0  --.-KB/s               \r",
       "simple-text.txt     100%[===================>]     300  --.-KB/s    in 0s      \n",
       "\n",
-      "2025-05-26 23:11:11 (4.81 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
+      "2025-06-09 22:10:29 (3.39 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
       "\n"
      ]
     }
@@ -466,6 +486,51 @@
     "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QVq5C0Uqs4wU"
+   },
+   "source": [
+    "**Downloading XML files**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Gip5P7Ess63U",
+    "outputId": "dde0fa15-2571-4b4a-ef73-517fe2b7a7a7"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2025-06-09 22:15:15--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 538 [text/plain]\n",
+      "Saving to: ‘xml-files/multi-level.xml’\n",
+      "\n",
+      "\r",
+      "multi-level.xml       0%[                    ]       0  --.-KB/s               \r",
+      "multi-level.xml     100%[===================>]     538  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-06-09 22:15:15 (21.2 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir xml-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -478,13 +543,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "bAkMjJ1vdalE",
-    "outputId": "15401bcc-3cb2-474a-d771-0efed1eaf9cd"
+    "outputId": "582dcc26-76ea-4cac-c5f6-46e009b639f9"
    },
    "outputs": [
     {
@@ -519,13 +584,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "VWbUgoVQrO8m",
-    "outputId": "36bbf310-7ee5-474a-93f2-4d940d3c0547"
+    "outputId": "56f4f9ce-41bb-48ba-b5db-7e1bde47d8d8"
    },
    "outputs": [
     {
@@ -558,13 +623,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "YFzeGJJ3ICVM",
-    "outputId": "01c349aa-16d2-4e0d-8a30-11399caf2ef2"
+    "outputId": "fc9bc68c-2b20-479e-8fe8-3e380877cebf"
    },
    "outputs": [
     {
@@ -597,13 +662,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "y_xl0ahaJ0Hy",
-    "outputId": "6040b119-2eca-4c58-f51b-e20fbefeef8d"
+    "outputId": "327222b8-0c6b-4578-8fde-4f14f9835edc"
    },
    "outputs": [
     {
@@ -636,13 +701,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "4JnKvFe5KVDf",
-    "outputId": "d91d1ee5-d4a3-48a1-b40a-d5f6bf997025"
+    "outputId": "c9252fb7-3840-4c95-d461-a56eef9adaea"
    },
    "outputs": [
     {
@@ -675,13 +740,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "_sldwjppKoPl",
-    "outputId": "467e9085-86dd-43df-f63b-a707b920d3b3"
+    "outputId": "0619383d-abf4-43a6-f63d-ad81897f8d9e"
    },
    "outputs": [
     {
@@ -714,13 +779,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "GltbZAjmKwQs",
-    "outputId": "c3f18b1f-06df-4233-8874-e9702c465e69"
+    "outputId": "df9ae11b-0186-4e61-d6ff-9581c597ccd1"
    },
    "outputs": [
     {
@@ -731,9 +796,9 @@
       "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
       "|                path|    modificationTime|length|                text|height_dimension|width_dimension|content|exception|pagenum|\n",
       "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487|   This is a page.\\n|             841|            595|   NULL|     NULL|      0|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487|This is another p...|             841|            595|   NULL|     NULL|      1|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487| Yet another page.\\n|             841|            595|   NULL|     NULL|      2|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487|   This is a page.\\n|             841|            595|   NULL|     NULL|      0|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487|This is another p...|             841|            595|   NULL|     NULL|      1|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487| Yet another page.\\n|             841|            595|   NULL|     NULL|      2|\n",
       "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
       "\n"
      ]
@@ -798,7 +863,9 @@
     "| `infer_table_structure` | Word, Excel, PowerPoint | Whether to generate an HTML table representation from structured table content. When enabled, a full `<table>` element is added alongside cell-level elements, based on row and column layout. |\n",
     "| `append_cells` | Excel | Whether to append all rows into a single content block instead of creating separate elements per row. |\n",
     "| `cell_separator` | Excel | String used to join cell values in a row when assembling textual output |\n",
-    "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |"
+    "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |\n",
+    "| `xml_keep_tags` | XML | Whether to retain original XML tag names and include them in the metadata for each extracted element |\n",
+    "| `only_leaf_nodes` | XML | If true, only the deepest elements are extracted. If false, all elements are extracted. |"
    ]
   },
   {
@@ -812,13 +879,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "gDJyUi_9R4fr",
-    "outputId": "4aebe625-444d-4161-be23-512708ced1b5"
+    "outputId": "181d8e88-7a0b-4a6e-f497-7fd4add3726c"
    },
    "outputs": [
     {
@@ -830,8 +897,8 @@
       "|                path|                 doc|\n",
       "+--------------------+--------------------+\n",
       "|file:/content/wor...|[{NarrativeText, ...|\n",
-      "|file:/content/wor...|[{Header, An inli...|\n",
       "|file:/content/wor...|[{Table, Header C...|\n",
+      "|file:/content/wor...|[{Header, An inli...|\n",
       "+--------------------+--------------------+\n",
       "\n"
      ]
@@ -843,50 +910,23 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 23,
+   "cell_type": "markdown",
    "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "3vz48AHQHyON",
-    "outputId": "f3ba8c4b-3bfc-453a-d8d4-f86a5fca0a1b"
+    "id": "F0lCz9OyPYYh"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Warning::Spark Session already created, some configs may not take.\n",
-      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
-      "|                path|    modificationTime|length|                text|height_dimension|width_dimension|content|exception|pagenum|\n",
-      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...| 25803|This is a Title \\...|             842|            596|   NULL|     NULL|      0|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      0|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      1|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      2|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487|   This is a page.\\n|             841|            595|   NULL|     NULL|      0|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487|This is another p...|             841|            595|   NULL|     NULL|      1|\n",
-      "|file:/content/pdf...|2025-05-26 23:11:...|  9487| Yet another page.\\n|             841|            595|   NULL|     NULL|      2|\n",
-      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
-      "\n"
-     ]
-    }
-   ],
    "source": [
-    "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n",
-    "partition_df.show()"
+    "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "S50lqIFskNO3",
-    "outputId": "e52f4cde-cfb9-4a55-d989-6e9fe40a0321"
+    "id": "qExdRJ2aPsYV",
+    "outputId": "9a033a02-4bae-4570-aaba-b81c23b8e0e1"
    },
    "outputs": [
     {
@@ -894,38 +934,40 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|path                                           |xls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n",
-      "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|file:/content/excel-files/PageBreakExample.xlsx|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}]|\n",
-      "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|                path|                 doc|             content|\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n",
+      "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n",
+      "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n",
+      "+--------------------+--------------------+--------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "partition_df = Partition(content_type = \"application/vnd.ms-excel\").partition(\"./excel-files/PageBreakExample.xlsx\")\n",
-    "partition_df.show(truncate=False)"
+    "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n",
+    "partition_df.show()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "F0lCz9OyPYYh"
+    "id": "E3bCFJZn8TS0"
    },
    "source": [
-    "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output"
+    "## Partitioning PDF Files"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 24,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "qExdRJ2aPsYV",
-    "outputId": "0284de34-ce6a-4d1e-91bc-268521111015"
+    "id": "3vz48AHQHyON",
+    "outputId": "19369e63-f963-4422-a791-57ea5394df1a"
    },
    "outputs": [
     {
@@ -933,19 +975,23 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|                path|                 doc|             content|\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n",
-      "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n",
-      "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n",
-      "+--------------------+--------------------+--------------------+\n",
+      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
+      "|                path|    modificationTime|length|                text|height_dimension|width_dimension|content|exception|pagenum|\n",
+      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...| 25803|This is a Title \\...|             842|            596|   NULL|     NULL|      0|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      0|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      1|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...| 15629|                  \\n|             841|            595|   NULL|     NULL|      2|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487|   This is a page.\\n|             841|            595|   NULL|     NULL|      0|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487|This is another p...|             841|            595|   NULL|     NULL|      1|\n",
+      "|file:/content/pdf...|2025-06-09 22:10:...|  9487| Yet another page.\\n|             841|            595|   NULL|     NULL|      2|\n",
+      "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n",
+    "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n",
     "partition_df.show()"
    ]
   },
@@ -969,13 +1015,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 27,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "_2J0zEmma8jm",
-    "outputId": "405391bf-60bf-4632-ef0e-e84496049c71"
+    "outputId": "90f668d7-03d9-496f-dc82-a620c59f9c08"
    },
    "outputs": [
     {
@@ -1018,13 +1064,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "4sY2ADN8dusy",
-    "outputId": "98af2c82-8a55-46ff-f631-7775431820cb"
+    "outputId": "8164237e-6835-404a-d7a7-b5ef0ef99c6d"
    },
    "outputs": [
     {
@@ -1046,24 +1092,33 @@
     "partition_df.show(truncate=False)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uMyqJX-K7dss"
+   },
+   "source": [
+    "## Partitioning MS Office documents"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
     "id": "_9dDTCrpGdoN"
    },
    "source": [
-    "For Word documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output."
+    "For Excel documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "7ICTZmLGk3Sa",
-    "outputId": "5e31a551-2746-4c45-b933-56f55e4866c9"
+    "outputId": "1796055a-808c-4eff-fc86-14e29cf9b53e"
    },
    "outputs": [
     {
@@ -1087,13 +1142,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 30,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "YId4UG1rOVQq",
-    "outputId": "7de8b4be-9936-4330-8a0f-019c3a55182a"
+    "outputId": "32827dea-d7b3-4137-abff-9e4502f8cd93"
    },
    "outputs": [
     {
@@ -1118,38 +1173,21 @@
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "jpRmFNPNNqkf"
-   },
-   "source": [
-    "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {
-    "id": "HwnYBQ5l7rDM"
+    "id": "E8ockED4NxLi"
    },
-   "outputs": [],
    "source": [
-    "text = (\n",
-    "            \"The big brown fox\\n\"\n",
-    "            \"was walking down the lane.\\n\"\n",
-    "            \"\\n\"\n",
-    "            \"At the end of the lane,\\n\"\n",
-    "            \"the fox met a bear.\"\n",
-    "        )"
+    "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 34,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "mutwZUFj720X",
-    "outputId": "87cd31c5-2f94-4777-9ea5-b6edf8277347"
+    "id": "fPCpk7RTGRjo",
+    "outputId": "a818ecd7-8580-4098-b30f-6e46b8ef6baa"
    },
    "outputs": [
     {
@@ -1157,61 +1195,77 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|txt                                                                                                                                                              |\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|path                                      |ppt                                                                                                                                                                                                                                                                                                                      |\n",
+      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n",
+      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n",
-    "text_df.show(truncate=False)"
+    "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n",
+    "partition_df.show(truncate=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "E8ockED4NxLi"
+    "id": "qRfRSGvhN303"
    },
    "source": [
-    "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output."
+    "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 35,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "xF8F-5CP3qWY",
-    "outputId": "71b5e0cb-b22a-4774-a7b6-83c4fd67fadb"
+    "id": "twLdjGxZWiOJ",
+    "outputId": "8adcaa80-b02c-4e8f-8205-20efa8c40b4b"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "fake-power-point.pptx  fake-power-point-table.pptx\n"
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|xls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, <table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr><tr><td>Assets</td><td>Debts</td></tr><tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr><tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr><tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr><tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr><tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>, {SheetName -> Sheet1}}]|\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
      ]
     }
    ],
    "source": [
-    "!ls ppt-files"
+    "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n",
+    "partition_df.select(\"xls\").show(truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8txswwbjN8Mg"
+   },
+   "source": [
+    "With Excel inputs, set `append_cells` to append all rows into a single content block instead of creating a separate element per row."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 36,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "fPCpk7RTGRjo",
-    "outputId": "74144c26-5060-4c99-f291-a097b838e774"
+    "id": "PQ4MpGw6xCko",
+    "outputId": "aaf807a7-27b9-40cc-8a75-58be077f8403"
    },
    "outputs": [
     {
@@ -1219,38 +1273,64 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|path                                      |ppt                                                                                                                                                                                                                                                                                                                      |\n",
-      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n",
-      "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|xls                                                                                                                                                                                                                                                      |\n",
+      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n",
+      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n",
-    "partition_df.show(truncate=False)"
+    "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n",
+    "partition_df.select(\"xls\").show(truncate=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "qRfRSGvhN303"
+    "id": "_GyL6D4N75i-"
    },
    "source": [
-    "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display."
+    "## Partitioning Text Files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jpRmFNPNNqkf"
+   },
+   "source": [
+    "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "id": "HwnYBQ5l7rDM"
+   },
+   "outputs": [],
+   "source": [
+    "text = (\n",
+    "            \"The big brown fox\\n\"\n",
+    "            \"was walking down the lane.\\n\"\n",
+    "            \"\\n\"\n",
+    "            \"At the end of the lane,\\n\"\n",
+    "            \"the fox met a bear.\"\n",
+    "        )"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 32,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "twLdjGxZWiOJ",
-    "outputId": "ec340358-7279-4247-b27c-5a0a25f38ee6"
+    "id": "mutwZUFj720X",
+    "outputId": "8b4f474d-2f3f-4e81-cecf-5de420561124"
    },
    "outputs": [
     {
@@ -1258,38 +1338,47 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|xls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, <table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr><tr><td>Assets</td><td>Debts</td></tr><tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr><tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr><tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr><tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr><tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>, {SheetName -> Sheet1}}]|\n",
-      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|txt                                                                                                                                                              |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n",
-    "partition_df.select(\"xls\").show(truncate=False)"
+    "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n",
+    "text_df.show(truncate=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "8txswwbjN8Mg"
+    "id": "epCp5DnQ8E7o"
    },
    "source": [
-    "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually."
+    "## Partitioning XML Files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DWX0nkc4tM7J"
+   },
+   "source": [
+    "In Spark NLP 6.0.3 we added support for XML files."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 45,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
-    "id": "PQ4MpGw6xCko",
-    "outputId": "808783d2-f15b-45ae-90fb-a623243898f3"
+    "id": "AViMSzKQtP-o",
+    "outputId": "147a1ef9-3f14-4832-a050-e60c8ac9544b"
    },
    "outputs": [
     {
@@ -1297,18 +1386,18 @@
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|xls                                                                                                                                                                                                                                                      |\n",
-      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
-      "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n",
-      "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|xml                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]|\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n",
-    "partition_df.select(\"xls\").show(truncate=False)"
+    "partition_df = Partition(xml_keep_tags = True).partition(\"./xml-files/multi-level.xml\")\n",
+    "partition_df.select(\"xml\").show(truncate=False)"
    ]
   }
  ],
diff --git a/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb
new file mode 100644
index 00000000000000..38b43aed37b95e
--- /dev/null
+++ b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb
@@ -0,0 +1,339 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "tzcU5p2gdak9"
+   },
+   "source": [
+    "# Introducing the XML Reader in Spark NLP\n",
+    "This notebook showcases the newly added `sparknlp.read().xml()` method in Spark NLP, which parses XML content from both local files and real-time URLs into a Spark DataFrame.\n",
+    "\n",
+    "**Key Features:**\n",
+    "- Ability to parse XML from local directories and URLs.\n",
+    "- Versatile support for varied data ingestion scenarios."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RFOFhaEedalB"
+   },
+   "source": [
+    "## Setup and Initialization\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "Support for reading XML files was introduced in Spark NLP 6.0.3. Please make sure you have upgraded to the latest Spark NLP release."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Y3hWfT5q-npM"
+   },
+   "source": [
+    "- Let's install and set up Spark NLP in Google Colab\n",
+    "- This part is pretty easy via our simple script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "id": "u3ORYVyb-pRI"
+   },
+   "outputs": [],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oIbFQyEo-tat"
+   },
+   "source": [
+    "For the local files example, we will download a couple of XML files from the Spark NLP GitHub repo:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ya8qZe00dalC",
+    "outputId": "7d597910-9826-4472-9fdc-5b8ac398e6cf"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2025-06-09 21:43:40--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 538 [text/plain]\n",
+      "Saving to: ‘xml-files/multi-level.xml’\n",
+      "\n",
+      "\r",
+      "multi-level.xml       0%[                    ]       0  --.-KB/s               \r",
+      "multi-level.xml     100%[===================>]     538  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-06-09 21:43:40 (34.0 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n",
+      "\n",
+      "--2025-06-09 21:43:40--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/test.xml\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 382 [text/plain]\n",
+      "Saving to: ‘xml-files/test.xml’\n",
+      "\n",
+      "test.xml            100%[===================>]     382  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-06-09 21:43:40 (7.58 MB/s) - ‘xml-files/test.xml’ saved [382/382]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir xml-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/test.xml -P xml-files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EoFI66NAdalE"
+   },
+   "source": [
+    "## Parsing XML from Local Files\n",
+    "Use the `xml()` method to parse XML content from local directories."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "bAkMjJ1vdalE",
+    "outputId": "0bba10be-75de-48de-9a06-d6197d35218f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+--------------------+--------------------+\n",
+      "|                path|                 xml|\n",
+      "+--------------------+--------------------+\n",
+      "|file:/content/xml...|[{Title, Harry Po...|\n",
+      "|file:/content/xml...|[{Title, The Alch...|\n",
+      "+--------------------+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sparknlp\n",
+    "xml_df = sparknlp.read().xml(\"./xml-files\")\n",
+    "\n",
+    "xml_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "oBj0cHPXSD1m",
+    "outputId": "00951736-40d4-4f9e-fe25-cc5117405269"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- path: string (nullable = true)\n",
+      " |-- xml: array (nullable = true)\n",
+      " |    |-- element: struct (containsNull = true)\n",
+      " |    |    |-- elementType: string (nullable = true)\n",
+      " |    |    |-- content: string (nullable = true)\n",
+      " |    |    |-- metadata: map (nullable = true)\n",
+      " |    |    |    |-- key: string\n",
+      " |    |    |    |-- value: string (valueContainsNull = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "xml_df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "FrVKxdySz8pR"
+   },
+   "source": [
+    "### Configuration Parameters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CC_klLwhV8um"
+   },
+   "source": [
+    "`xmlKeepTags`: When true, includes the tag name of each XML element in the metadata under the key `tag`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aNfN0fQC0Vzz",
+    "outputId": "ebdb1393-b91c-4c60-d7e7-b7ecc6465171"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|xml                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> title}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> author}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> year}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> price}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> title}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> author}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> year}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> price}}]|\n",
+      "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]                                                                                                                                                                                                                                                                  |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"xmlKeepTags\": \"true\"}\n",
+    "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+    "xml_df.select(\"xml\").show(truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "t06KtTItWQ4R"
+   },
+   "source": [
+    "`onlyLeafNodes`: When true, includes only leaf elements (i.e., elements with no child elements) in the output. When false, all elements (including containers) are included."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "jTM1btqNntUL",
+    "outputId": "f86a0b28-73ac-46d1-8d26-f920e2d935cd"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|xml                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|[{UncategorizedText, , {elementId -> 931f811d0c9b488a01a7875f80992a62}}, {UncategorizedText, , {elementId -> 1f610d9429ab17d0d7ab49ee3069b4fc, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, , {elementId -> 249aff1b3e9835325b45e51cdfc4ad46, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]                                                                                                                                                                                                                                                  |\n",
+      "|[{UncategorizedText, , {elementId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> d7416d9cac3ba3af57ef6b6b71d7841b, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> b79ae4ca74ec00f63a00b6cd66acc1e0, parentId -> d7416d9cac3ba3af57ef6b6b71d7841b}}, {UncategorizedText, , {elementId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, parentId -> b79ae4ca74ec00f63a00b6cd66acc1e0}}, {Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, , {elementId -> 9ebecf846e7dea80c563ebcb2f7d4a9a, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> 80472cd1880f453b8adecc61870748ba, parentId -> 9ebecf846e7dea80c563ebcb2f7d4a9a}}, {UncategorizedText, , {elementId -> 9708b29025b53d9f54c723ee005b647b, parentId -> 80472cd1880f453b8adecc61870748ba}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]|\n",
+      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"onlyLeafNodes\": \"false\"}\n",
+    "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+    "xml_df.select(\"xml\").show(truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "O8DePUq8nkYm"
+   },
+   "source": [
+    "You can access the raw content of the file using the `storeContent` parameter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "E0S5aRb5WFLf",
+    "outputId": "5e624eeb-fbc1-47a4-ff21-aef410a10bb2"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|path                                   |content                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |xml                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |\n",
+      "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|file:/content/xml-files/test.xml       |<bookstore>\\n    <book category=\"children\">\\n        <title lang=\"en\">Harry Potter</title>\\n        <author>J K. Rowling</author>\\n        <year>2005</year>\\n        <price>29.99</price>\\n    </book>\\n    <book category=\"web\">\\n        <title lang=\"en\">Learning XML</title>\\n        <author>Erik T. Ray</author>\\n        <year>2003</year>\\n        <price>39.95</price>\\n    </book>\\n</bookstore>                                                                                                                                                                   |[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]|\n",
+      "|file:/content/xml-files/multi-level.xml|<library>\\n    <section name=\"Fiction\">\\n        <shelf number=\"1\">\\n            <book>\\n                <title>The Alchemist</title>\\n                <author>Paulo Coelho</author>\\n                <year>1988</year>\\n            </book>\\n        </shelf>\\n    </section>\\n    <section name=\"Science\">\\n        <shelf number=\"2\">\\n            <book>\\n                <title>A Brief History of Time</title>\\n                <author>Stephen Hawking</author>\\n                <year>1988</year>\\n            </book>\\n        </shelf>\\n    </section>\\n</library>\\n|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]                                                                                                                                                                                                                                      |\n",
+      "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"storeContent\": \"true\"}\n",
+    "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n",
+    "xml_df.show(truncate=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py
index dfd865116f3821..86bf5781053050 100644
--- a/python/sparknlp/reader/sparknlp_reader.py
+++ b/python/sparknlp/reader/sparknlp_reader.py
@@ -322,4 +322,49 @@ def txt(self, docPath):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.txt(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Reader options are taken from the ``params`` dictionary given when the
+        reader is created, e.g. ``sparknlp.read({"onlyLeafNodes": "false"})``.
+        The XML reader looks at ``storeContent``, ``xmlKeepTags`` and
+        ``onlyLeafNodes``.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Raises
+        ------
+        TypeError
+            If ``docPath`` is not a string.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+        The same read can be written in one line through ``sparknlp``
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+        >>> xml_df.select("xml").show(truncate=False)
+        +------------------------------------------------------------+
+        |xml                                                         |
+        +------------------------------------------------------------+
+        |[{Title, Harry Potter, {elementId -> ..., parentId -> ...}}]|
+        +------------------------------------------------------------+
+
+        A ``tag`` entry is added to the metadata only when ``xmlKeepTags`` is
+        enabled.
+
+        >>> xml_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- xml: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        # Fail fast on the Python side instead of surfacing a JVM-side error.
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        # Delegate the read to the wrapped JVM reader, then wrap the Java result.
+        jdf = self._java_obj.xml(docPath)
         return self.getDataFrame(self.spark, jdf)
\ No newline at end of file
diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py
index 68ea10b36476bf..c2baa14fec213d 100644
--- a/python/test/sparknlp_test.py
+++ b/python/test/sparknlp_test.py
@@ -125,4 +125,18 @@ def runTest(self):
         txt_df = sparknlp.read().txt(self.txt_file)
         txt_df.show()
 
-        self.assertTrue(txt_df.select("txt").count() > 0)
\ No newline at end of file
+        self.assertTrue(txt_df.select("txt").count() > 0)
+
+
+@pytest.mark.fast
+class SparkNLPTestXMLFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml"
+
+    def runTest(self):
+        xml_df = sparknlp.read().xml(self.xml_files)
+        xml_df.show()
+
+        self.assertTrue(xml_df.select("xml").count() > 0)
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
new file mode 100644
index 00000000000000..4993bc65a8cd8b
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.Param
+
+/** Parameters that configure how XML input is read.
+  *
+  * Defaults: `xmlKeepTags` is false and `onlyLeafNodes` is true.
+  */
+trait HasXmlReaderProperties extends ParamsAndFeaturesWritable {
+
+  // Local import: BooleanParam is only needed to instantiate the two parameters below.
+  import org.apache.spark.ml.param.BooleanParam
+
+  /** Whether to include XML tag names as metadata in the output.
+    *
+    * Instantiated as a `BooleanParam` because the inherited `Param.jsonEncode` only supports
+    * string, vector and matrix values, so a plain `Param[Boolean]` cannot be encoded when the
+    * owning stage is persisted. The declared type stays `Param[Boolean]` for callers.
+    */
+  val xmlKeepTags: Param[Boolean] = new BooleanParam(
+    this,
+    "xmlKeepTags",
+    "Whether to include XML tag names as metadata in the output.")
+
+  /** Sets `xmlKeepTags`.
+    *
+    * @param value
+    *   true to add the tag name to each element's metadata
+    * @return
+    *   this instance, to allow chaining
+    */
+  def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value)
+
+  /** If true, only XML leaf nodes (elements without nested child elements) are processed.
+    *
+    * Instantiated as a `BooleanParam` for the same persistence reason as `xmlKeepTags`.
+    */
+  val onlyLeafNodes: Param[Boolean] = new BooleanParam(
+    this,
+    "onlyLeafNodes",
+    "If true, only processes XML leaf nodes (no nested children).")
+
+  /** Sets `onlyLeafNodes`.
+    *
+    * @param value
+    *   true to restrict processing to leaf nodes
+    * @return
+    *   this instance, to allow chaining
+    */
+  def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value)
+
+  setDefault(xmlKeepTags -> false, onlyLeafNodes -> true)
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
index a339fe9b258ee4..2e6f69b8c5b4c4 100644
--- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
@@ -188,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
           "application/vnd.openxmlformats-officedocument.presentationml.presentation" =>
         sparkNLPReader.ppt
       case "application/pdf" => sparkNLPReader.pdf
+      case "application/xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -199,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "text/plain" => sparkNLPReader.txtToHTMLElement
       case "text/html" => sparkNLPReader.htmlToHTMLElement
       case "url" => sparkNLPReader.urlToHTMLElement
+      case "application/xml" => sparkNLPReader.xmlToHTMLElement
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -234,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "xls" | "xlsx" => sparkNLPReader.xls
       case "ppt" | "pptx" => sparkNLPReader.ppt
       case "pdf" => sparkNLPReader.pdf
+      case "xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
     }
   }
diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
index 73d461c91aaafc..281af53931d72c 100644
--- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
@@ -86,6 +86,7 @@ class PartitionTransformer(override val uid: String)
     with HasPowerPointProperties
     with HasTextReaderProperties
     with HasPdfProperties
+    with HasXmlReaderProperties
     with HasChunkerProperties {
 
   def this() = this(Identifiable.randomUID("PartitionTransformer"))
@@ -157,7 +158,9 @@ class PartitionTransformer(override val uid: String)
       "newAfterNChars" -> $(newAfterNChars).toString,
       "overlap" -> $(overlap).toString,
       "combineTextUnderNChars" -> $(combineTextUnderNChars).toString,
-      "overlapAll" -> $(overlapAll).toString)
+      "overlapAll" -> $(overlapAll).toString,
+      "xmlKeepTags" -> $(xmlKeepTags).toString,
+      "onlyLeafNodes" -> $(onlyLeafNodes).toString)
     val partitionInstance = new Partition(params.asJava)
 
     val inputColum = if (get(inputCols).isDefined) {
diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
index a1637116cb7905..216492876cc718 100644
--- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
@@ -296,7 +296,6 @@ class SparkNLPReader(
     *  |-- width_dimension: integer (nullable = true)
     *  |-- content: binary (nullable = true)
     *  |-- exception: string (nullable = true)
-    *  |-- pagenum: integer (nullable = true)
     * }}}
     *
     * @param params
@@ -642,4 +641,69 @@ class SparkNLPReader(
       default = BLOCK_SPLIT_PATTERN)
   }
 
+  /** Reads XML files into a DataFrame of structured elements.
+    *
+    * `xmlPath` is either the path to a single XML file or to a directory of XML files, e.g.
+    * "path/xml/files".
+    *
+    * The reader is configured from this instance's `params`: `xmlKeepTags` (alias
+    * `xml_keep_tags`, default false), `onlyLeafNodes` (alias `only_leaf_nodes`, default true)
+    * and the store-content setting resolved by `getStoreContent`.
+    *
+    * ==Example==
+    * {{{
+    * val xmlPath = "home/user/xml-directory"
+    * val sparkNLPReader = new SparkNLPReader()
+    * val xmlDf = sparkNLPReader.xml(xmlPath)
+    * }}}
+    *
+    * ==Example 2==
+    * The same read as a one-liner through SparkNLP
+    * {{{
+    * val xmlDf = SparkNLP.read.xml(xmlPath)
+    * }}}
+    *
+    * {{{
+    * xmlDf.select("xml").show(false)
+    * +------------------------------------------------------------+
+    * |xml                                                         |
+    * +------------------------------------------------------------+
+    * |[{Title, Harry Potter, {elementId -> ..., parentId -> ...}}]|
+    * +------------------------------------------------------------+
+    *
+    * xmlDf.printSchema()
+    * root
+    *  |-- path: string (nullable = true)
+    *  |-- xml: array (nullable = true)
+    *  |    |-- element: struct (containsNull = true)
+    *  |    |    |-- elementType: string (nullable = true)
+    *  |    |    |-- content: string (nullable = true)
+    *  |    |    |-- metadata: map (nullable = true)
+    *  |    |    |    |-- key: string
+    *  |    |    |    |-- value: string (valueContainsNull = true)
+    * }}}
+    *
+    * A `tag` entry appears in the metadata only when `xmlKeepTags` is enabled.
+    *
+    * @param xmlPath
+    *   Path to the XML file or directory
+    * @return
+    *   A DataFrame with parsed XML as structured elements
+    */
+  def xml(xmlPath: String): DataFrame =
+    new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes).read(xmlPath)
+
+  /** Parses XML supplied directly as a string rather than read from a path.
+    *
+    * @param xml
+    *   The XML document text
+    * @return
+    *   The elements produced by `XMLReader.parseXml` for that text
+    */
+  def xmlToHTMLElement(xml: String): Seq[HTMLElement] =
+    new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes).parseXml(xml)
+
+  /** Resolves the keep-tags option from `params` through `getDefaultBoolean`, accepting either
+    * the `xmlKeepTags` or the `xml_keep_tags` key and using false as the default.
+    */
+  private def getXmlKeepTags: Boolean = {
+    val acceptedKeys = Seq("xmlKeepTags", "xml_keep_tags")
+    getDefaultBoolean(params.asScala.toMap, acceptedKeys, default = false)
+  }
+
+  /** Resolves the leaf-only option from `params` through `getDefaultBoolean`, accepting either
+    * the `onlyLeafNodes` or the `only_leaf_nodes` key and using true as the default.
+    */
+  private def getOnlyLeafNodes: Boolean = {
+    val acceptedKeys = Seq("onlyLeafNodes", "only_leaf_nodes")
+    val readerParams = params.asScala.toMap
+    getDefaultBoolean(readerParams, acceptedKeys, default = true)
+  }
+
 }
diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
new file mode 100644
index 00000000000000..fc777458dafb83
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, udf}
+
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.xml.{Elem, Node, XML}
+
+/** Class to parse and read XML files.
+ *
+ * @param storeContent
+ *   Whether to include the raw XML content in the resulting DataFrame as a separate 'content'
+ *   column. By default, this is false.
+ *
+ * @param xmlKeepTags
+ *   Whether to retain original XML tag names and include them in the metadata for each extracted
+ *   element. Useful for preserving structure. Default is false.
+ *
+ * @param onlyLeafNodes
+ *   If true, only the deepest elements (those without child elements) are extracted. If false,
+ *   all elements are extracted. Default is true.
+ *
+ * ==Input Format==
+ * Input must be a valid path to an XML file or a directory containing XML files.
+ *
+ * ==Example==
+ * {{{
+ * val xmlPath = "./data/sample.xml"
+ * val xmlReader = new XMLReader()
+ * val xmlDf = xmlReader.read(xmlPath)
+ * }}}
+ *
+ * {{{
+ * xmlDf.show(truncate = false)
+ * +----------------------+--------------------------------------------------+
+ * |path                  |xml                                               |
+ * +----------------------+--------------------------------------------------+
+ * |file:/data/sample.xml |[{Title, My Book, {elementId -> ...}}, ...]       |
+ * +----------------------+--------------------------------------------------+
+ *
+ * xmlDf.printSchema()
+ * root
+ *  |-- path: string (nullable = true)
+ *  |-- xml: array (nullable = true)
+ *  |    |-- element: struct (containsNull = true)
+ *  |    |    |-- elementType: string (nullable = true)
+ *  |    |    |-- content: string (nullable = true)
+ *  |    |    |-- metadata: map (nullable = true)
+ *  |    |    |    |-- key: string
+ *  |    |    |    |-- value: string (valueContainsNull = true)
+ * }}}
+ *
+ * Every element's metadata carries an `elementId`; `parentId` is added for elements that have
+ * an enclosing element, and `tag` only when `xmlKeepTags` is true.
+ *
+ * For more examples refer to:
+ * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb notebook]]
+ */
+class XMLReader(
+    storeContent: Boolean = false,
+    xmlKeepTags: Boolean = false,
+    onlyLeafNodes: Boolean = true)
+    extends Serializable {
+
+  private lazy val spark = ResourceHelper.spark
+
+  // Name of the DataFrame column that receives the parsed elements.
+  private var outputColumn = "xml"
+
+  /** Changes the name of the column holding the parsed elements (default "xml").
+    *
+    * @param value
+    *   Non-empty column name
+    * @return
+    *   this reader, to allow chaining
+    * @throws IllegalArgumentException
+    *   if `value` is empty
+    */
+  def setOutputColumn(value: String): this.type = {
+    require(value.nonEmpty, "Output column name cannot be empty.")
+    outputColumn = value
+    this
+  }
+
+  /** Reads the XML file(s) at `inputSource` and parses each file's content.
+    *
+    * @param inputSource
+    *   Path to an XML file or to a directory of XML files
+    * @return
+    *   A DataFrame with `path` and the output column, plus `content` when `storeContent` is
+    *   true
+    * @throws IllegalArgumentException
+    *   if `inputSource` is rejected by `validFile`
+    */
+  def read(inputSource: String): DataFrame = {
+    if (validFile(inputSource)) {
+      val xmlDf = datasetWithTextFile(spark, inputSource)
+        .withColumn(outputColumn, parseXmlUDF(col("content")))
+      if (storeContent) xmlDf.select("path", "content", outputColumn)
+      else xmlDf.select("path", outputColumn)
+    } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource")
+  }
+
+  // Column-level wrapper around parseXml, applied to the raw file content.
+  private val parseXmlUDF = udf((xml: String) => {
+    parseXml(xml)
+  })
+
+  /** Parses an XML document and flattens it into a list of elements in document order.
+    *
+    * `title` and `author` tags (compared in lower case) become `ElementType.TITLE`; every other
+    * tag becomes `ElementType.UNCATEGORIZED_TEXT`. Leaf elements carry their trimmed text;
+    * non-leaf elements, which are emitted only when `onlyLeafNodes` is false, carry empty
+    * content.
+    *
+    * @param xmlString
+    *   The XML document text
+    * @return
+    *   The extracted elements
+    */
+  def parseXml(xmlString: String): List[HTMLElement] = {
+    // NOTE(review): this relies on scala-xml's default parser. Whether DOCTYPE declarations and
+    // external entities are rejected depends on the scala-xml version on the classpath; confirm
+    // before feeding untrusted XML.
+    val xml = XML.loadString(xmlString)
+    val elements = ListBuffer[HTMLElement]()
+
+    def traverse(node: Node, parentId: Option[String]): Unit = {
+      node match {
+        case elem: Elem =>
+          val tagName = elem.label.toLowerCase
+          val textContent = elem.text.trim
+          // NOTE(review): the id is derived from tag + text only, so two elements with the same
+          // tag and text share an elementId (and children of such elements share a parentId).
+          val elementId = hash(tagName + textContent)
+
+          // A leaf is an element with no child elements; text-only children do not count.
+          val isLeaf = !elem.child.exists {
+            case _: Elem => true
+            case _ => false
+          }
+
+          if (!onlyLeafNodes || isLeaf) {
+            val elementType = tagName match {
+              case "title" | "author" => ElementType.TITLE
+              case _ => ElementType.UNCATEGORIZED_TEXT
+            }
+
+            val metadata = mutable.Map[String, String]("elementId" -> elementId)
+            if (xmlKeepTags) metadata += ("tag" -> tagName)
+            parentId.foreach(id => metadata += ("parentId" -> id))
+
+            // Container elements are emitted as empty markers; their text lives on the leaves.
+            val content = if (isLeaf) textContent else ""
+            elements += HTMLElement(elementType, content, metadata)
+          }
+
+          // Traverse children, passing this element's id down as their parent id
+          elem.child.foreach(traverse(_, Some(elementId)))
+
+        case _ => // Ignore non-element nodes such as text and comments
+      }
+    }
+
+    traverse(xml, None)
+    elements.toList
+  }
+
+  /** Returns the MD5 digest of `s` as a lower-case hexadecimal string.
+    *
+    * The string is encoded as UTF-8 before hashing so that the result does not depend on the
+    * JVM's default charset.
+    *
+    * @param s
+    *   The text to hash
+    * @return
+    *   A 32-character hexadecimal digest
+    */
+  def hash(s: String): String = {
+    java.security.MessageDigest
+      .getInstance("MD5")
+      .digest(s.getBytes(java.nio.charset.StandardCharsets.UTF_8))
+      .map("%02x".format(_))
+      .mkString
+  }
+
+}
diff --git a/src/test/resources/reader/xml/multi-level.xml b/src/test/resources/reader/xml/multi-level.xml
new file mode 100644
index 00000000000000..e14e5ad684be30
--- /dev/null
+++ b/src/test/resources/reader/xml/multi-level.xml
@@ -0,0 +1,20 @@
+<library>
+    <section name="Fiction">
+        <shelf number="1">
+            <book>
+                <title>The Alchemist</title>
+                <author>Paulo Coelho</author>
+                <year>1988</year>
+            </book>
+        </shelf>
+    </section>
+    <section name="Science">
+        <shelf number="2">
+            <book>
+                <title>A Brief History of Time</title>
+                <author>Stephen Hawking</author>
+                <year>1988</year>
+            </book>
+        </shelf>
+    </section>
+</library>
diff --git a/src/test/resources/reader/xml/test.xml b/src/test/resources/reader/xml/test.xml
new file mode 100644
index 00000000000000..44bdab910b4c96
--- /dev/null
+++ b/src/test/resources/reader/xml/test.xml
@@ -0,0 +1,14 @@
+<bookstore>
+    <book category="children">
+        <title lang="en">Harry Potter</title>
+        <author>J K. Rowling</author>
+        <year>2005</year>
+        <price>29.99</price>
+    </book>
+    <book category="web">
+        <title lang="en">Learning XML</title>
+        <author>Erik T. Ray</author>
+        <year>2003</year>
+        <price>39.95</price>
+    </book>
+</bookstore>
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
index 9937b95f59e512..05c5916c843424 100644
--- a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
@@ -32,6 +32,7 @@ class PartitionTest extends AnyFlatSpec {
   val emailDirectory = "src/test/resources/reader/email"
   val htmlDirectory = "src/test/resources/reader/html"
   val pdfDirectory = "src/test/resources/reader/pdf"
+  val xmlDirectory = "src/test/resources/reader/xml" // fixtures for the XML content_type test
 
   "Partition" should "work with text content_type" taggedAs FastTest in {
     val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
@@ -181,4 +182,11 @@ class PartitionTest extends AnyFlatSpec {
     assert(elements == expectedElements)
   }
 
+  it should "work with XML content_type" taggedAs FastTest in {
+    // Partition the XML fixtures directory using the application/xml content type.
+    val xmlDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory)
+    xmlDf.show()
+    assert(!xmlDf.select("xml").isEmpty)
+  }
+
 }
diff --git a/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
new file mode 100644
index 00000000000000..a75537803e61de
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
@@ -0,0 +1,43 @@
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.tags.FastTest
+import org.apache.spark.sql.functions.{array_contains, col, explode, map_keys}
+import org.scalatest.flatspec.AnyFlatSpec
+
+/** Exercises [[XMLReader]] against the XML fixtures under `src/test/resources/reader/xml`. */
+class XMLReaderTest extends AnyFlatSpec {
+  val xmlFilesDirectory = "./src/test/resources/reader/xml/"
+
+  "XMLReader" should "read xml as dataframe" taggedAs FastTest in {
+    val reader = new XMLReader()
+    val parsedDf = reader.read(s"$xmlFilesDirectory/test.xml")
+    parsedDf.show(truncate = false)
+
+    // Default reader: the result has rows and the raw "content" column is not in the output.
+    assert(!parsedDf.select(col("xml").getItem(0)).isEmpty)
+    assert(!parsedDf.columns.contains("content"))
+  }
+
+  it should "include tags in the output" taggedAs FastTest in {
+    val reader = new XMLReader(xmlKeepTags = true)
+    val parsedDf = reader.read(s"$xmlFilesDirectory/multi-level.xml")
+    parsedDf.show(truncate = false)
+
+    // With xmlKeepTags enabled, some exploded element must carry a non-empty "tag" entry.
+    val taggedDf = parsedDf
+      .withColumn("xml_exploded", explode(col("xml")))
+      .filter(col("xml_exploded.metadata")("tag") =!= "")
+    assert(taggedDf.count() > 0)
+  }
+
+  it should "output all nodes" taggedAs FastTest in {
+    val reader = new XMLReader(onlyLeafNodes = false)
+    val parsedDf = reader.read(s"$xmlFilesDirectory/multi-level.xml")
+    parsedDf.show(truncate = false)
+
+    // With onlyLeafNodes disabled, at least one element must lack a "parentId" metadata key.
+    val hasParentId = array_contains(map_keys(col("xml_exploded.metadata")), "parentId")
+    val orphansDf = parsedDf.withColumn("xml_exploded", explode(col("xml"))).filter(!hasParentId)
+    assert(orphansDf.count() > 0)
+  }
+}