From 52f1be61a74de68150a6b26079458d41e110884d Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 9 Nov 2022 13:04:24 -0600 Subject: [PATCH 1/3] add sql guide --- docs/source/_toctree.yml | 4 + docs/source/how_to.md | 3 +- docs/source/tabular_load.mdx | 209 +++++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 docs/source/tabular_load.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e52e9050bba..e89e521afc4 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -73,6 +73,10 @@ - local: nlp_process title: Process text data title: "Text" + - sections: + - local: tabular_load + title: Load tabular data + title: "Tabular" - sections: - local: share title: Share diff --git a/docs/source/how_to.md b/docs/source/how_to.md index 13e66a807ac..7e6cf8f719e 100644 --- a/docs/source/how_to.md +++ b/docs/source/how_to.md @@ -10,12 +10,13 @@ Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/c -The guides are organized into five sections: +The guides are organized into six sections: - General usage: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities. - Audio: How to load, process, and share audio datasets. - Vision: How to load, process, and share image datasets. - Text: How to load, process, and share text datasets. +- Tabular: How to load, process, and share tabular datasets. - Dataset repository: How to share and upload a dataset to the Hub. If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). 
diff --git a/docs/source/tabular_load.mdx b/docs/source/tabular_load.mdx new file mode 100644 index 00000000000..95e8213694d --- /dev/null +++ b/docs/source/tabular_load.mdx @@ -0,0 +1,209 @@ +# Load tabular data + +Many real-world datasets are stored in databases which are typically accessed by SQL queries. With 🤗 Datasets, you can connect to a database, query for the data you need, and create a dataset out of it. Then you can use all the processing features of 🤗 Datasets to prepare your dataset for training. + +This guide will show you how to connect to SQLite and PostgreSQL and: + +- Load an entire table. +- Load from a SQL query. + +Check out this [notebook](https://colab.research.google.com/github/nateraw/huggingface-hub-examples/blob/main/sql_with_huggingface_datasets.ipynb) for a hands-on example! + +## SQLite + +SQLite is a small, lightweight database that is fast and easy to set up. You can use an existing database if you'd like, or follow along and start from scratch. + +Start by creating a quick SQLite database with this [Covid-19 data](https://github.com/nytimes/covid-19-data/blob/master/us-states.csv) from the New York Times: + +```py +>>> import sqlite3 +>>> import pandas as pd + +>>> conn = sqlite3.connect("us_covid_data.db") +>>> df = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv") +>>> df.to_sql("states", conn, if_exists="replace") +``` + +This creates a `states` table in the `us_covid_data.db` database which you can now load into a dataset. + +### Entire table + +To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. 
+ +For SQLite, it is: + +```py +>>> uri = "sqlite:///us_covid_data.db" +``` + +Load the table by passing the table name and URI to [`~datasets.Dataset.from_sql`]: + +```py +>>> from datasets import Dataset + +>>> ds = Dataset.from_sql("states", uri) +>>> ds +Dataset({ + features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], + num_rows: 54382 +}) +``` + +Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: + +```py +>>> ds.filter(lambda x: x["state"] == "California") +``` + +### SQL query + +You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. + +Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: + +```py +>>> from datasets import Dataset + +>>> ds = Dataset.from_sql('SELECT * FROM states WHERE state="California";', uri) +>>> ds +Dataset({ + features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], + num_rows: 1019 +}) +``` + +Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: + +```py +>>> ds.filter(lambda x: x["cases"] > 10000) +``` + +## PostgreSQL + + + +This example is designed to only run in a Google Colab. Be careful if you want to run the server commands locally! + + + +PostgreSQL is a popular open-source database. You can use an existing database if you'd like, or follow along and start from scratch. 
+ +Start by installing the PostgreSQL server and set up an empty database and password: + +```py +# Install postgresql server +!sudo apt-get -y -qq update +!sudo apt-get -y -qq install postgresql +!sudo service postgresql start + +# Setup a password `postgres` for username `postgres` +!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';" + +# Setup a database with name `hfds_demo` to be used +!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS hfds_demo;' +!sudo -u postgres psql -U postgres -c 'CREATE DATABASE hfds_demo;' +``` + +Set up the environment variables: + +```py +%env POSTGRES_DB_NAME=hfds_demo +%env POSTGRES_DB_HOST=localhost +%env POSTGRES_DB_PORT=5432 +%env POSTGRES_DB_USER=postgres +%env POSTGRES_DB_PASS=postgres +``` + +Then you can load the [Air Quality Data Set](https://archive.ics.uci.edu/ml/datasets/Air+Quality) from the UCI Machine Learning Repository into your newly created database: + +```py +!curl -s -OL https://github.com/tensorflow/io/raw/master/docs/tutorials/postgresql/AirQualityUCI.sql + +!PGPASSWORD=$POSTGRES_DB_PASS psql -q -h $POSTGRES_DB_HOST -p $POSTGRES_DB_PORT -U $POSTGRES_DB_USER -d $POSTGRES_DB_NAME -f AirQualityUCI.sql +``` + +### Entire table + +To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. + +For PostgreSQL, it is: + +```py +>>> import os + +>>> postgres_uri = "postgresql://{}:{}@{}?port={}&dbname={}".format( +... os.environ['POSTGRES_DB_USER'], +... os.environ['POSTGRES_DB_PASS'], +... os.environ['POSTGRES_DB_HOST'], +... os.environ['POSTGRES_DB_PORT'], +... os.environ['POSTGRES_DB_NAME'], +... 
) +>>> postgres_uri +'postgresql://postgres:postgres@localhost?port=5432&dbname=hfds_demo' +``` + +The Air Quality Data Set table can't be loaded directly because 🤗 Datasets can't figure out how to cast some of the columns to their correct underlying feature types. You can fix this issue by [specifying your own features](loading#specify-features): + +```py +>>> from datasets import Value, Features + +>>> features = Features({ +... 'date': Value('date32'), +... 'time': Value('string'), +... 'co': Value('float32'), +... 'pt08s1': Value('int32'), +... 'nmhc': Value('float32'), +... 'c6h6': Value('float32'), +... 'pt08s2': Value('int32'), +... 'nox': Value('float32'), +... 'pt08s3': Value('int32'), +... 'no2': Value('float32'), +... 'pt08s4': Value('int32'), +... 'pt08s5': Value('int32'), +... 't': Value('float32'), +... 'rh': Value('float32'), +... 'ah': Value('float32'), +... }) +``` + +Now load the table by passing the table name, URI and features to [`~datasets.Dataset.from_sql`]: + +```py +>>> from datasets import Dataset + +>>> ds = Dataset.from_sql("airqualityuci", postgres_uri, features=features) +>>> ds +Dataset({ + features: ['date', 'time', 'co', 'pt08s1', 'nmhc', 'c6h6', 'pt08s2', 'nox', 'pt08s3', 'no2', 'pt08s4', 'pt08s5', 't', 'rh', 'ah'], + num_rows: 9357 +}) +``` + +Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: + +```py +>>> ds.filter(lambda x: x["co"] > 3) +``` + +### SQL query + +You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
+ +Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: + +```py +>>> from datasets import Dataset + +>>> ds = Dataset.from_sql('SELECT date, co FROM AirQualityUCI WHERE co > 3;', postgres_uri) +>>> ds +Dataset({ + features: ['date', 'co'], + num_rows: 1715 +}) +``` + +Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: + +```py +>>> ds.filter(lambda x: x["co"] > 5) +``` \ No newline at end of file From fe87bc5d95023d9079fd7e96209781d1cac03778 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 9 Nov 2022 15:59:00 -0600 Subject: [PATCH 2/3] remove duplicate section headers --- docs/source/tabular_load.mdx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/source/tabular_load.mdx b/docs/source/tabular_load.mdx index 95e8213694d..4c7920a8cf0 100644 --- a/docs/source/tabular_load.mdx +++ b/docs/source/tabular_load.mdx @@ -26,8 +26,6 @@ Start by creating a quick SQLite database with this [Covid-19 data](https://gith This creates a `states` table in the `us_covid_data.db` database which you can now load into a dataset. -### Entire table - To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. For SQLite, it is: @@ -55,8 +53,6 @@ Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset. >>> ds.filter(lambda x: x["state"] == "California") ``` -### SQL query - You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: @@ -122,8 +118,6 @@ Then you can load the [Air Quality Data Set](https://archive.ics.uci.edu/ml/data !PGPASSWORD=$POSTGRES_DB_PASS psql -q -h $POSTGRES_DB_HOST -p $POSTGRES_DB_PORT -U $POSTGRES_DB_USER -d $POSTGRES_DB_NAME -f AirQualityUCI.sql ``` -### Entire table - To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. For PostgreSQL, it is: @@ -185,8 +179,6 @@ Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset. >>> ds.filter(lambda x: x["co"] > 3) ``` -### SQL query - You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: From 830efb6af7a342683830f7bee28002aa32bf68ea Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 14 Nov 2022 16:15:34 -0600 Subject: [PATCH 3/3] apply reviews --- docs/source/loading.mdx | 54 +++------- docs/source/tabular_load.mdx | 184 ++++++++++++----------------------- 2 files changed, 73 insertions(+), 165 deletions(-) diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 3dd51e98639..0a049f74caf 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -106,39 +106,18 @@ Datasets can be loaded from local files stored on your computer and from remote ### CSV -🤗 Datasets can read a dataset made up of one or several CSV files: +🤗 Datasets can read a dataset made up of one or several CSV files (in this case, pass your CSV files as a list): ```py >>> from datasets import load_dataset >>> dataset = load_dataset("csv", data_files="my_file.csv") ``` -If you have more than one CSV file: - -```py ->>> dataset = load_dataset("csv", data_files=["my_file_1.csv", "my_file_2.csv", "my_file_3.csv"]) -``` - -You can also map the training and test splits to specific CSV files: - -```py ->>> dataset = load_dataset("csv", data_files={"train": ["my_train_file_1.csv", "my_train_file_2.csv"], "test": "my_test_file.csv"}) -``` - -To load remote CSV files via HTTP, pass the URLs instead: - -```py ->>> base_url = "https://huggingface.co/datasets/lhoestq/demo1/resolve/main/data/" ->>> dataset = load_dataset('csv', data_files={'train': base_url + 'train.csv', 'test': base_url + 'test.csv'}) -``` + -To load zipped CSV files: +For more details, check out the [how to load tabular datasets from CSV files](tabular_load#csv-files) guide. 
-```py ->>> url = "https://domain.org/train_data.zip" ->>> data_files = {"train": url} ->>> dataset = load_dataset("csv", data_files=data_files) -``` + ### JSON @@ -198,28 +177,19 @@ To load remote Parquet files via HTTP, pass the URLs instead: ### SQL -Read database contents with [`Dataset.from_sql`]. Both table names and queries are supported. - -For example, a table from a SQLite file can be loaded with: +Read database contents with [`~datasets.Dataset.from_sql`] by specifying the URI to connect to your database. You can read both table names and queries: ```py >>> from datasets import Dataset ->>> dataset = Dataset.from_sql("data_table", "sqlite:///sqlite_file.db") -``` - -Use a query for a more precise read: - -```py ->>> from sqlite3 import connect ->>> con = connect(":memory") ->>> # db writes ... ->>> from datasets import Dataset ->>> dataset = Dataset.from_sql("SELECT text FROM table WHERE length(text) > 100 LIMIT 10", con) +# load entire table +>>> dataset = Dataset.from_sql("data_table_name", con="sqlite:///sqlite_file.db") +# load from query +>>> dataset = Dataset.from_sql("SELECT text FROM table WHERE length(text) > 100 LIMIT 10", con="sqlite:///sqlite_file.db") ``` -You can specify [`Dataset.from_sql#con`] as a [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for the 🤗 Datasets caching to work across sessions. +For more details, check out the [how to load tabular datasets from SQL databases](tabular_load#databases) guide. @@ -273,9 +243,9 @@ Load Pandas DataFrames with [`~Dataset.from_pandas`]: >>> dataset = Dataset.from_pandas(df) ``` - + -An object data type in [pandas.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html) doesn't always carry enough information for Arrow to automatically infer a data type. For example, if a DataFrame is of length `0` or the Series only contains `None/NaN` objects, the type is set to `null`. 
Avoid potential errors by constructing an explicit schema with [`Features`] using the `from_dict` or `from_pandas` methods. See the [troubleshoot](./loading#specify-features) section for more details on how to explicitly specify your own features. +For more details, check out the [how to load tabular datasets from Pandas DataFrames](tabular_load#pandas-dataframes) guide. diff --git a/docs/source/tabular_load.mdx b/docs/source/tabular_load.mdx index 4c7920a8cf0..165dee7d622 100644 --- a/docs/source/tabular_load.mdx +++ b/docs/source/tabular_load.mdx @@ -1,182 +1,114 @@ # Load tabular data -Many real-world datasets are stored in databases which are typically accessed by SQL queries. With 🤗 Datasets, you can connect to a database, query for the data you need, and create a dataset out of it. Then you can use all the processing features of 🤗 Datasets to prepare your dataset for training. +A tabular dataset is a generic dataset used to describe any data stored in rows and columns, where the rows represent an example and the columns represent a feature (can be continuous or categorical). These datasets are commonly stored in CSV files, Pandas DataFrames, and in database tables. This guide will show you how to load and create a tabular dataset from: -This guide will show you how to connect to SQLite and PostgreSQL and: +- CSV files +- Pandas DataFrames +- Databases -- Load an entire table. -- Load from a SQL query. +## CSV files -Check out this [notebook](https://colab.research.google.com/github/nateraw/huggingface-hub-examples/blob/main/sql_with_huggingface_datasets.ipynb) for a hands-on example! - -## SQLite - -SQLite is a small, lightweight database that is fast and easy to set up. You can use an existing database if you'd like, or follow along and start from scratch. 
- -Start by creating a quick SQLite database with this [Covid-19 data](https://github.com/nytimes/covid-19-data/blob/master/us-states.csv) from the New York Times: +🤗 Datasets can read CSV files by specifying the generic `csv` dataset script in the [`~datasets.load_dataset`] method. To load more than one CSV file, pass them as a list to the `data_files` parameter: ```py ->>> import sqlite3 ->>> import pandas as pd +>>> from datasets import load_dataset +>>> dataset = load_dataset("csv", data_files="my_file.csv") ->>> conn = sqlite3.connect("us_covid_data.db") ->>> df = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv") ->>> df.to_sql("states", conn, if_exists="replace") +# load multiple CSV files +>>> dataset = load_dataset("csv", data_files=["my_file_1.csv", "my_file_2.csv", "my_file_3.csv"]) ``` -This creates a `states` table in the `us_covid_data.db` database which you can now load into a dataset. - -To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. 
- -For SQLite, it is: +You can also map specific CSV files to the train and test splits: ```py ->>> uri = "sqlite:///us_covid_data.db" +>>> dataset = load_dataset("csv", data_files={"train": ["my_train_file_1.csv", "my_train_file_2.csv"], "test": "my_test_file.csv"}) ``` -Load the table by passing the table name and URI to [`~datasets.Dataset.from_sql`]: +To load remote CSV files, pass the URLs instead: ```py ->>> from datasets import Dataset - ->>> ds = Dataset.from_sql("states", uri) ->>> ds -Dataset({ - features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], - num_rows: 54382 -}) +>>> base_url = "https://huggingface.co/datasets/lhoestq/demo1/resolve/main/data/" +>>> dataset = load_dataset('csv', data_files={"train": base_url + "train.csv", "test": base_url + "test.csv"}) ``` -Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: +To load zipped CSV files: ```py ->>> ds.filter(lambda x: x["state"] == "California") +>>> url = "https://domain.org/train_data.zip" +>>> data_files = {"train": url} +>>> dataset = load_dataset("csv", data_files=data_files) ``` -You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
+## Pandas DataFrames -Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: +🤗 Datasets also supports loading datasets from [Pandas DataFrames](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) with the [`~datasets.Dataset.from_pandas`] method: ```py >>> from datasets import Dataset +>>> import pandas as pd ->>> ds = Dataset.from_sql('SELECT * FROM states WHERE state="California";', uri) ->>> ds -Dataset({ - features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], - num_rows: 1019 -}) +# create a Pandas DataFrame +>>> df = pd.read_csv("https://huggingface.co/datasets/imodels/credit-card/raw/main/train.csv") +>>> df = pd.DataFrame(df) +# load Dataset from Pandas DataFrame +>>> dataset = Dataset.from_pandas(df) ``` -Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: +Use the `splits` parameter to specify the name of the dataset split: ```py ->>> ds.filter(lambda x: x["cases"] > 10000) +>>> train_ds = Dataset.from_pandas(train_df, split="train") +>>> test_ds = Dataset.from_pandas(test_df, split="test") ``` -## PostgreSQL +If the dataset doesn't look as expected, you should explicitly [specify your dataset features](loading#specify-features). A [pandas.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html) may not always carry enough information for Arrow to automatically infer a data type. For example, if a DataFrame is of length `0` or if the Series only contains `None/NaN` objects, the type is set to `null`. - +## Databases -This example is designed to only run in a Google Colab. Be careful if you want to run the server commands locally! +Datasets stored in databases are typically accessed with SQL queries. With 🤗 Datasets, you can connect to a database, query for the data you need, and create a dataset out of it. Then you can use all the processing features of 🤗 Datasets to prepare your dataset for training. 
- +### SQLite -PostgreSQL is a popular open-source database. You can use an existing database if you'd like, or follow along and start from scratch. - -Start by installing the PostgreSQL server and set up an empty database and password: - -```py -# Install postgresql server -!sudo apt-get -y -qq update -!sudo apt-get -y -qq install postgresql -!sudo service postgresql start - -# Setup a password `postgres` for username `postgres` -!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';" - -# Setup a database with name `hfds_demo` to be used -!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS hfds_demo;' -!sudo -u postgres psql -U postgres -c 'CREATE DATABASE hfds_demo;' -``` - -Set up the environment variables: - -```py -%env POSTGRES_DB_NAME=hfds_demo -%env POSTGRES_DB_HOST=localhost -%env POSTGRES_DB_PORT=5432 -%env POSTGRES_DB_USER=postgres -%env POSTGRES_DB_PASS=postgres -``` +SQLite is a small, lightweight database that is fast and easy to set up. You can use an existing database if you'd like, or follow along and start from scratch. 
-Then you can load the [Air Quality Data Set](https://archive.ics.uci.edu/ml/datasets/Air+Quality) from the UCI Machine Learning Repository into your newly created database: +Start by creating a quick SQLite database with this [Covid-19 data](https://github.com/nytimes/covid-19-data/blob/master/us-states.csv) from the New York Times: ```py -!curl -s -OL https://github.com/tensorflow/io/raw/master/docs/tutorials/postgresql/AirQualityUCI.sql +>>> import sqlite3 +>>> import pandas as pd -!PGPASSWORD=$POSTGRES_DB_PASS psql -q -h $POSTGRES_DB_HOST -p $POSTGRES_DB_PORT -U $POSTGRES_DB_USER -d $POSTGRES_DB_NAME -f AirQualityUCI.sql +>>> conn = sqlite3.connect("us_covid_data.db") +>>> df = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv") +>>> df.to_sql("states", conn, if_exists="replace") ``` -To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. - -For PostgreSQL, it is: +This creates a `states` table in the `us_covid_data.db` database which you can now load into a dataset. -```py ->>> import os - ->>> postgres_uri = "postgresql://{}:{}@{}?port={}&dbname={}".format( -... os.environ['POSTGRES_DB_USER'], -... os.environ['POSTGRES_DB_PASS'], -... os.environ['POSTGRES_DB_HOST'], -... os.environ['POSTGRES_DB_PORT'], -... os.environ['POSTGRES_DB_NAME'], -... ) ->>> postgres_uri -'postgresql://postgres:postgres@localhost?port=5432&dbname=hfds_demo' -``` +To connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. Connecting to a database with a URI caches the returned dataset. 
The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using. -The Air Quality Data Set table can't be loaded directly because 🤗 Datasets can't figure out how to cast some of the columns to their correct underlying feature types. You can fix this issue by [specifying your own features](loading#specify-features): +For SQLite, it is: ```py ->>> from datasets import Value, Features - ->>> features = Features({ -... 'date': Value('date32'), -... 'time': Value('string'), -... 'co': Value('float32'), -... 'pt08s1': Value('int32'), -... 'nmhc': Value('float32'), -... 'c6h6': Value('float32'), -... 'pt08s2': Value('int32'), -... 'nox': Value('float32'), -... 'pt08s3': Value('int32'), -... 'no2': Value('float32'), -... 'pt08s4': Value('int32'), -... 'pt08s5': Value('int32'), -... 't': Value('float32'), -... 'rh': Value('float32'), -... 'ah': Value('float32'), -... }) +>>> uri = "sqlite:///us_covid_data.db" ``` -Now load the table by passing the table name, URI and features to [`~datasets.Dataset.from_sql`]: +Load the table by passing the table name and URI to [`~datasets.Dataset.from_sql`]: ```py >>> from datasets import Dataset ->>> ds = Dataset.from_sql("airqualityuci", postgres_uri, features=features) +>>> ds = Dataset.from_sql("states", uri) >>> ds Dataset({ - features: ['date', 'time', 'co', 'pt08s1', 'nmhc', 'c6h6', 'pt08s2', 'nox', 'pt08s3', 'no2', 'pt08s4', 'pt08s5', 't', 'rh', 'ah'], - num_rows: 9357 + features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], + num_rows: 54382 }) ``` Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: ```py ->>> ds.filter(lambda x: x["co"] > 3) +>>> ds.filter(lambda x: x["state"] == "California") ``` You can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
Load the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]: ```py >>> from datasets import Dataset ->>> ds = Dataset.from_sql('SELECT date, co FROM AirQualityUCI WHERE co > 3;', postgres_uri) +>>> ds = Dataset.from_sql('SELECT * FROM states WHERE state="California";', uri) >>> ds Dataset({ - features: ['date', 'co'], - num_rows: 1715 + features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'], + num_rows: 1019 }) ``` Then you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example: ```py ->>> ds.filter(lambda x: x["co"] > 5) -``` \ No newline at end of file +>>> ds.filter(lambda x: x["cases"] > 10000) +``` + +### PostgreSQL + +You can also connect and load a dataset from a PostgreSQL database; however, we won't directly demonstrate how in the documentation because the example is only meant to be run in a notebook. Instead, take a look at how to install and set up a PostgreSQL server in this [notebook](https://colab.research.google.com/github/nateraw/huggingface-hub-examples/blob/main/sql_with_huggingface_datasets.ipynb#scrollTo=d83yGQMPHGFi)! + +After you've set up your PostgreSQL database, you can use the [`~datasets.Dataset.from_sql`] method to load a dataset from a table or query. \ No newline at end of file