diff --git a/.coveragerc b/.coveragerc index a0523ed4a49..cb53bfdaf96 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,6 +9,7 @@ include = dns/* datastore/* error_reporting/* + language/* managed_vms/* monitoring/* speech/* diff --git a/conftest.py b/conftest.py index 14876c24eac..3fa68de6953 100644 --- a/conftest.py +++ b/conftest.py @@ -15,9 +15,10 @@ import os import pytest +import requests -class Namespace: +class Namespace(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) @@ -48,3 +49,24 @@ def resource(request): testing resource""" local_path = os.path.dirname(request.module.__file__) return lambda *args: get_resource_path(args, local_path) + + +def fetch_gcs_resource(resource, tmpdir, _chunk_size=1024): + resp = requests.get(resource, stream=True) + dest_file = str(tmpdir.join(os.path.basename(resource))) + with open(dest_file, 'wb') as f: + for chunk in resp.iter_content(_chunk_size): + f.write(chunk) + + return dest_file + + +@pytest.fixture(scope='module') +def remote_resource(cloud_config): + """Provides a function that downloads the given resource from Cloud + Storage, returning the path to the downloaded resource.""" + remote_uri = 'http://storage.googleapis.com/{}/'.format( + cloud_config.storage_bucket) + + return lambda path, tmpdir: fetch_gcs_resource( + remote_uri + path.strip('/'), tmpdir) diff --git a/language/README.md b/language/README.md new file mode 100644 index 00000000000..130ce66ff83 --- /dev/null +++ b/language/README.md @@ -0,0 +1,14 @@ +# Google Cloud Natural Language API examples + +This directory contains Python examples that use the +[Google Cloud Natural Language API](https://cloud.google.com/natural-language/). + +- [api](api) has a simple command line tool that shows off the API's features. + +- [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) +to extract text from images, then uses the NL API to extract entity information +from those texts, and stores the extracted information in a database in support +of further analysis and correlation. + +- [syntax_triples](syntax_triples) uses syntax analysis to find +subject-verb-object triples in a given piece of text. diff --git a/language/api/README.md b/language/api/README.md new file mode 100644 index 00000000000..9625df30c89 --- /dev/null +++ b/language/api/README.md @@ -0,0 +1,87 @@ + +# Google Cloud Natural Language API Sample + +This Python sample demonstrates the use of the [Google Cloud Natural Language API][NL-Docs] +for sentiment, entity, and syntax analysis. + +[NL-Docs]: https://cloud.google.com/natural-language/docs/ + +## Setup + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Run the sample + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +```sh +$ pip install -r requirements.txt +``` + +Then, run the script: + +```sh +$ python analyze.py +``` + +where `` is one of: `entities`, `sentiment`, or `syntax`. 
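+
+If you would rather call the analysis functions from Python than from the
+command line, a minimal sketch might look like the following (this assumes the
+`analyze` module in this directory is importable and that
+`GOOGLE_APPLICATION_CREDENTIALS` is exported as described above; the sample
+sentences are only illustrative):
+
+```python
+import json
+
+import analyze
+
+# Sentiment analysis returns a documentSentiment with polarity and magnitude.
+result = analyze.analyze_sentiment('The weather today is absolutely lovely.')
+print(json.dumps(result, indent=2))
+
+# Entity and syntax analysis also take an encoding type, which should match
+# Python's native string representation.
+result = analyze.analyze_entities(
+    'Tom Sawyer is a book written by a guy known as Mark Twain.',
+    analyze.get_native_encoding_type())
+for entity in result.get('entities', []):
+    print('{}: {}'.format(entity['name'], entity['type']))
+```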
+ +The script will write to STDOUT the json returned from the API for the requested feature. + +For example, if you run: + +```sh +$ python analyze.py entities "Tom Sawyer is a book written by a guy known as Mark Twain." +``` + +You will see something like the following returned: + +``` +{ + "entities": [ + { + "salience": 0.49785897, + "mentions": [ + { + "text": { + "content": "Tom Sawyer", + "beginOffset": 0 + } + } + ], + "type": "PERSON", + "name": "Tom Sawyer", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/The_Adventures_of_Tom_Sawyer" + } + }, + { + "salience": 0.12209519, + "mentions": [ + { + "text": { + "content": "Mark Twain", + "beginOffset": 47 + } + } + ], + "type": "PERSON", + "name": "Mark Twain", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Mark_Twain" + } + } + ], + "language": "en" +} +``` diff --git a/language/api/analyze.py b/language/api/analyze.py new file mode 100644 index 00000000000..73e892c354a --- /dev/null +++ b/language/api/analyze.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Analyzes text using the Google Cloud Natural Language API.""" + +import argparse +import json +import sys + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def get_service(): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def analyze_entities(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().analyzeEntities(body=body) + response = request.execute() + + return response + + +def analyze_sentiment(text): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + } + } + + service = get_service() + + request = service.documents().analyzeSentiment(body=body) + response = request.execute() + + return response + + +def analyze_syntax(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().annotateText(body=body) + response = request.execute() + + return response + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('command', choices=[ + 'entities', 'sentiment', 'syntax']) + parser.add_argument('text') + + args = parser.parse_args() + + if 
args.command == 'entities': + result = analyze_entities(args.text, get_native_encoding_type()) + elif args.command == 'sentiment': + result = analyze_sentiment(args.text) + elif args.command == 'syntax': + result = analyze_syntax(args.text, get_native_encoding_type()) + + print(json.dumps(result, indent=2)) diff --git a/language/api/analyze_test.py b/language/api/analyze_test.py new file mode 100644 index 00000000000..11b0d65d629 --- /dev/null +++ b/language/api/analyze_test.py @@ -0,0 +1,258 @@ +# Copyright 2016, Google, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import textwrap + +import analyze + + +def test_analyze_entities(): + result = analyze.analyze_entities( + 'Tom Sawyer is a book written by a guy known as Mark Twain.') + + assert result['language'] == 'en' + entities = result['entities'] + assert len(entities) + subject = entities[0] + assert subject['type'] == 'PERSON' + assert subject['name'].startswith('Tom') + + +def test_analyze_sentiment(capsys): + result = analyze.analyze_sentiment( + 'your face is really ugly and i hate it.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] < 0 + assert sentiment['magnitude'] < 1 + + result = analyze.analyze_sentiment( + 'cheerio, mate - I greatly admire the pallor of your visage, and your ' + 'angle of repose leaves little room for improvement.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] > 0 + assert sentiment['magnitude'] < 1 + + +def test_analyze_syntax(capsys): + result = analyze.analyze_syntax(textwrap.dedent(u'''\ + Keep away from people who try to belittle your ambitions. Small people + always do that, but the really great make you feel that you, too, can + become great. + - Mark Twain''')) + + assert len(result['tokens']) + first_token = result['tokens'][0] + assert first_token['text']['content'] == 'Keep' + assert first_token['partOfSpeech']['tag'] == 'VERB' + assert len(result['sentences']) > 1 + assert result['language'] == 'en' + + +def test_analyze_syntax_utf8(): + """Demonstrate the interpretation of the offsets when encoding=utf8. + + UTF8 is a variable-length encoding, where each character is at least 8 + bits. The offsets we get should be the index of the first byte of the + character. 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf8') + result = analyze.analyze_syntax(test_string, encoding='UTF8') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+4].decode('utf8') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + # 'b' is only one byte long + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[4]['text']['content']) + + +def test_analyze_syntax_utf16(): + """Demonstrate the interpretation of the offsets when encoding=utf16. + + UTF16 is a variable-length encoding, where each character is at least 16 + bits. The returned offsets will be the index of the first 2-byte character + of the token. + """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf16') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[2:] + result = analyze.analyze_syntax(test_string, encoding='UTF16') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 16 bits. Since + # we have an 8-bit array, the offsets should be doubled to index into our + # array. + offset = 2 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 2 * tokens[1]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 2 * tokens[2]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 2 * tokens[3]['text'].get('beginOffset', 0) + # A UTF16 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 4].decode('utf16') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. 
+ assert tokens[4]['text']['content'] == u'b' + offset = 2 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf16 still encodes it using 16 + # bits + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32(): + """Demonstrate the interpretation of the offsets when encoding=utf32. + + UTF32 is a fixed-length encoding, where each character is exactly 32 bits. + The returned offsets will be the index of the first 4-byte character + of the token. + + Python unicode objects index by the interpreted unicode character. This + means a given unicode character only ever takes up one slot in a unicode + string. This is equivalent to indexing into a UTF32 string, where all + characters are a fixed length and thus will only ever take up one slot. + + Thus, if you're indexing into a python unicode object, you can set + encoding to UTF32 to index directly into the unicode object (as opposed to + the byte arrays, as these examples do). + + Nonetheless, this test still demonstrates indexing into the byte array, for + consistency. Note that you could just index into the origin test_string + unicode object with the raw offset returned by the api (ie without + multiplying it by 4, as it is below). + """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf32') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[4:] + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 32 bits. Since + # we have an 8-bit array, the offsets should be quadrupled to index into + # our array. + offset = 4 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 4 * tokens[1]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 4 * tokens[2]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 4 * tokens[3]['text'].get('beginOffset', 0) + # A UTF32 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. 
+ assert tokens[4]['text']['content'] == u'b' + offset = 4 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf32 still encodes it using 32 + # bits + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32_directly_index_into_unicode(): + """Demonstrate using offsets directly, using encoding=utf32. + + See the explanation for test_annotate_text_utf32. Essentially, indexing + into a utf32 array is equivalent to indexing into a python unicode object. + """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[0]['text']['content'] + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[1]['text']['content'] + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[2]['text']['content'] + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[3]['text']['content'] + + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[4]['text']['content'] diff --git a/language/api/requirements.txt b/language/api/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/api/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/ocr_nl/README.md b/language/ocr_nl/README.md new file mode 100644 index 00000000000..189e9397901 --- /dev/null +++ b/language/ocr_nl/README.md @@ -0,0 +1,227 @@ + +# Using the Cloud Natural Language API to analyze image text found with Cloud Vision + +This example uses the [Cloud Vision API](https://cloud.google.com/vision/) to +detect text in images, then analyzes that text using the [Cloud NL (Natural +Language) API](https://cloud.google.com/natural-language/) to detect +[entities](https://cloud.google.com/natural-language/docs/basics#entity_analysis) +in the text. It stores the detected entity +information in an [sqlite3](https://www.sqlite.org) database, which may then be +queried. + +(This kind of analysis can be useful with scans of brochures and fliers, +invoices, and other types of company documents... or maybe just organizing your +memes). + +After the example script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database. + +## Setup + +### Install sqlite3 as necessary + +The example requires that sqlite3 be installed. Most likely, sqlite3 is already +installed for you on your machine, but if not, you can find it +[here](https://www.sqlite.org/download.html). + +### Set Up to Authenticate With Your Project's Credentials + +* Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. 
+* Following those steps, make sure that you [Set Up a Service
+  Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account),
+  and export the following environment variable:
+
+  ```
+  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json
+  ```
+* This sample also requires that you [enable the Cloud Vision
+  API](https://console.cloud.google.com/apis/api/vision.googleapis.com/overview?project=_)
+
+## Running the example
+
+Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed.
+
+To run the example, install the necessary libraries using pip:
+
+```sh
+$ pip install -r requirements.txt
+```
+
+You must also be set up to authenticate with the Cloud APIs using your
+project's service account credentials, as described above.
+
+Then, run the script on a directory of images to do the analysis, e.g.:
+
+```sh
+$ python main.py --input_directory=<dir>
+```
+
+You can try this on a sample directory of images:
+
+```sh
+$ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip
+$ unzip ocr_nl-images.zip
+$ python main.py --input_directory=images/
+```
+
+## A walkthrough of the example and its results
+
+Let's take a look at what the example generates when run on the `images/`
+sample directory, and how it does it.
+
+The script looks at each image file in the given directory, and uses the Vision
+API's text detection capabilities (OCR) to find any text in each image. It
+passes that info to the NL API and asks it to detect
+[entities](https://cloud.google.com/natural-language/docs/basics#entity_analysis)
+in the discovered text, then stores this information in a queryable database.
+
+To keep things simple, we're just passing to the NL API all the text found in a
+given image, in one string. Note that sometimes this string can include
+misinterpreted characters (if the image text was not very clear), or list words
+"out of order" from how a human would interpret them. So, the text that is
+actually passed to the NL API might not be quite what you would have predicted
+with your human eyeballs.
+
+The Entity information returned by the NL API includes *type*, *name*, *salience*,
+information about where in the text the given entity was found, and detected
+language. It may also include *metadata*, including a link to a Wikipedia URL
+that the NL API believes this entity maps to. See the
+[documentation](https://cloud.google.com/natural-language/docs/) and the [API
+reference pages](https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity)
+for more information about `Entity` fields.
+
+For example, if the NL API was given the sentence:
+
+```
+"Holmes and Watson walked over to the cafe."
+```
+
+it would return a response something like the following:
+
+```
+{
+  "entities": [{
+    "salience": 0.51629782,
+    "mentions": [{
+      "text": {
+        "content": "Holmes",
+        "beginOffset": 0
+      }}],
+    "type": "PERSON",
+    "name": "Holmes",
+    "metadata": {
+      "wikipedia_url": "http://en.wikipedia.org/wiki/Sherlock_Holmes"
+    }},
+  {
+    "salience": 0.22334209,
+    "mentions": [{
+      "text": {
+        "content": "Watson",
+        "beginOffset": 11
+      }}],
+    "type": "PERSON",
+    "name": "Watson",
+    "metadata": {
+      "wikipedia_url": "http://en.wikipedia.org/wiki/Dr._Watson"
+    }}],
+  "language": "en"
+}
+```
+
+Note that the NL API determined from context that "Holmes" was referring to
+'Sherlock Holmes', even though the name "Sherlock" was not included.
+
+Note also that not all nouns in a given sentence are detected as Entities.
An +Entity represents a phrase in the text that is a known entity, such as a person, +an organization, or location. The generic mention of a 'cafe' is not treated as +an entity in this sense. + +For each image file, we store its detected entity information (if any) in an +sqlite3 database. + +### Querying for information about the detected entities + +Once the detected entity information from all the images is stored in the +sqlite3 database, we can run some queries to do some interesting analysis. The +script runs a couple of such example query sets and outputs the result to STDOUT. + +The first set of queries outputs information about the top 15 most frequent +entity names found in the images, and the second outputs information about the +top 15 most frequent Wikipedia URLs found. + +For example, with the sample image set, note that the name 'Sherlock Holmes' is +found three times, but entities associated with the URL +http://en.wikipedia.org/wiki/Sherlock_Holmes are found four times; one of the +entity names was only "Holmes", but the NL API detected from context that it +referred to Sherlock Holmes. Similarly, you can see that mentions of 'Hive' and +'Spark' mapped correctly – given their context – to the URLs of those Apache +products. + +``` +----entity: http://en.wikipedia.org/wiki/Apache_Hive was found with count 1 +Found in file images/IMG_20160621_133020.jpg, detected as type OTHER, with + locale en. +names(s): set([u'hive']) +salience measure(s): set([0.0023808887]) +``` + +Similarly, 'Elizabeth' (in screencaps of text from "Pride and Prejudice") is +correctly mapped to http://en.wikipedia.org/wiki/Elizabeth_Bennet because of the +context of the surrounding text. + +``` +----entity: http://en.wikipedia.org/wiki/Elizabeth_Bennet was found with count 2 +Found in file images/Screenshot 2016-06-19 11.51.50.png, detected as type PERSON, with + locale en. +Found in file images/Screenshot 2016-06-19 12.08.30.png, detected as type PERSON, with + locale en. +names(s): set([u'elizabeth']) +salience measure(s): set([0.34601286, 0.0016268975]) +``` + +## Further queries to the sqlite3 database + +When the script runs, it makes a couple of example queries to the database +containing the entity information returned from the NL API. You can make further +queries on that database by starting up sqlite3 from the command line, and +passing it the name of the database file generated by running the example. This +file will be in the same directory, and have `entities` as a prefix, with the +timestamp appended. (If you have run the example more than once, a new database +file will be created each time). + +Run sqlite3 as follows (using the name of your own database file): + +```sh +$ sqlite3 entities1466518508.db +``` + +You'll see something like this: + +``` +SQLite version 3.8.10.2 2015-05-20 18:17:19 +Enter ".help" for usage hints. +sqlite> +``` + +From this prompt, you can make any queries on the data that you want. E.g., +start with something like: + +``` +sqlite> select * from entities limit 20; +``` + +Or, try this to see in which images the most entities were detected: + +``` +sqlite> select filename, count(filename) from entities group by filename; +``` + +You can do more complex queries to get further information about the entities +that have been discovered in your images. E.g., you might want to investigate +which of the entities are most commonly found together in the same image. See +the [SQLite documentation](https://www.sqlite.org/docs.html) for more +information. 
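+
+As one example, the short Python sketch below joins the `entities` table to
+itself to count how often two entity names were detected in the same image
+(the database filename is the one from the example above; use whatever
+database file your own run produced):
+
+```python
+import sqlite3
+
+# Open the database generated by main.py.
+conn = sqlite3.connect('entities1466518508.db')
+
+# Self-join on filename to count the images in which two entity names
+# co-occur.
+query = '''
+    SELECT e1.name, e2.name, COUNT(DISTINCT e1.filename) AS images
+    FROM entities e1
+    JOIN entities e2
+      ON e1.filename = e2.filename AND e1.name < e2.name
+    GROUP BY e1.name, e2.name
+    ORDER BY images DESC
+    LIMIT 10
+'''
+for name1, name2, images in conn.execute(query):
+    print('{} and {} appear together in {} image(s)'.format(
+        name1, name2, images))
+
+conn.close()
+```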
+ + diff --git a/language/ocr_nl/main.py b/language/ocr_nl/main.py new file mode 100755 index 00000000000..6e329f53386 --- /dev/null +++ b/language/ocr_nl/main.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example uses the Google Cloud Vision API to detect text in images, then +analyzes that text using the Google Cloud Natural Language API to detect +entities in the text. It stores the detected entity information in an sqlite3 +database, which may then be queried. + +After this script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database; see the README for more information. + +Run the script on a directory of images to do the analysis, E.g.: + $ python main.py --input_directory= + +You can try this on a sample directory of images: + $ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip + $ unzip ocr_nl-images.zip + $ python main.py --input_directory=images/ + +""" # noqa + +import argparse +import base64 +import contextlib +import logging +import os +import sqlite3 +import sys +import time + +from googleapiclient import discovery +from googleapiclient import errors +import httplib2 +from oauth2client.client import GoogleCredentials + +BATCH_SIZE = 10 + + +class VisionApi(object): + """Construct and use the Cloud Vision API service.""" + + def __init__(self): + credentials = GoogleCredentials.get_application_default() + self.service = discovery.build('vision', 'v1', credentials=credentials) + + def detect_text(self, input_filenames, num_retries=3, max_results=6): + """Uses the Vision API to detect text in the given file.""" + batch_request = [] + for filename in input_filenames: + request = { + 'image': {}, + 'features': [{ + 'type': 'TEXT_DETECTION', + 'maxResults': max_results, + }] + } + + # Accept both files in cloud storage, as well as local files. 
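+            # Images in Cloud Storage are passed to the API by URI; local
+            # images are read and base64-encoded into the request body.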
+ if filename.startswith('gs://'): + request['image']['source'] = { + 'gcsImageUri': filename + } + else: + with open(filename, 'rb') as image_file: + request['image']['content'] = base64.b64encode( + image_file.read()).decode('UTF-8') + + batch_request.append(request) + + request = self.service.images().annotate( + body={'requests': batch_request}) + + try: + responses = request.execute(num_retries=num_retries) + if 'responses' not in responses: + return {} + + text_response = {} + for filename, response in zip( + input_filenames, responses['responses']): + + if 'error' in response: + logging.error('API Error for {}: {}'.format( + filename, + response['error'].get('message', ''))) + continue + + text_response[filename] = response.get('textAnnotations', []) + + return text_response + + except errors.HttpError as e: + logging.error('Http Error for {}: {}'.format(filename, e)) + except KeyError as e2: + logging.error('Key error: {}'.format(e2)) + + +class TextAnalyzer(object): + """Construct and use the Google Natural Language API service.""" + + def __init__(self, db_filename=None): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + self.service = discovery.build('language', 'v1beta1', http=http) + + # This list will store the entity information gleaned from the + # image files. + self.entity_info = [] + + # This is the filename of the sqlite3 database to save to + self.db_filename = db_filename or 'entities{}.db'.format( + int(time.time())) + + def _get_native_encoding_type(self): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + def nl_detect(self, text): + """Use the Natural Language API to analyze the given text string.""" + # We're only requesting 'entity' information from the Natural Language + # API at this time. + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': self._get_native_encoding_type(), + } + entities = [] + try: + request = self.service.documents().analyzeEntities(body=body) + response = request.execute() + entities = response['entities'] + except errors.HttpError as e: + logging.error('Http Error: %s' % e) + except KeyError as e2: + logging.error('Key error: %s' % e2) + return entities + + def add_entities(self, filename, locale, document): + """Apply the Natural Language API to the document, and collect the + detected entities.""" + + # Apply the Natural Language API to the document. + entities = self.nl_detect(document) + self.extract_and_save_entity_info(entities, locale, filename) + + def extract_entity_info(self, entity): + """Extract information about an entity.""" + type = entity['type'] + name = entity['name'].lower() + metadata = entity['metadata'] + salience = entity['salience'] + wiki_url = metadata.get('wikipedia_url', None) + return (type, name, salience, wiki_url) + + def extract_and_save_entity_info(self, entities, locale, filename): + for entity in entities: + type, name, salience, wiki_url = self.extract_entity_info(entity) + # Because this is a small example, we're using a list to hold + # all the entity information, then we'll insert it into the + # database all at once when we've processed all the files. + # For a larger data set, you would want to write to the database + # in batches. 
+ self.entity_info.append( + (locale, type, name, salience, wiki_url, filename)) + + def write_entity_info_to_db(self): + """Store the info gleaned about the entities in the text, via the + Natural Language API, in an sqlite3 database table, and then print out + some simple analytics. + """ + logging.info('Saving entity info to the sqlite3 database.') + # Create the db. + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + with conn as cursor: + # Create table + cursor.execute( + 'CREATE TABLE if not exists entities (locale text, ' + 'type text, name text, salience real, wiki_url text, ' + 'filename text)') + with conn as cursor: + # Load all the data + cursor.executemany( + 'INSERT INTO entities VALUES (?,?,?,?,?,?)', + self.entity_info) + + def output_entity_data(self): + """Output some info about the entities by querying the generated + sqlite3 database. + """ + + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + + # This query finds the number of times each entity name was + # detected, in descending order by count, and returns information + # about the first 15 names, including the files in which they were + # found, their detected 'salience' and language (locale), and the + # wikipedia urls (if any) associated with them. + print('\n==============\nTop 15 most frequent entity names:') + + cursor = conn.cursor() + results = cursor.execute( + 'select name, count(name) as wc from entities ' + 'group by name order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print(u'\n----Name: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, wiki_url, salience ' + 'FROM entities WHERE name=?', (item[0],)) + urls = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {} and salience {}.').format( + elt[2], elt[1], elt[3], elt[5])) + if elt[4]: + urls.add(elt[4]) + if urls: + print('url(s): {}'.format(urls)) + + # This query finds the number of times each wikipedia url was + # detected, in descending order by count, and returns information + # about the first 15 urls, including the files in which they were + # found and the names and 'salience' with which they were + # associated. + print('\n==============\nTop 15 most frequent Wikipedia URLs:') + c = conn.cursor() + results = c.execute( + 'select wiki_url, count(wiki_url) as wc from entities ' + 'group by wiki_url order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print('\n----entity: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, salience ' + 'FROM entities WHERE wiki_url=?', (item[0],)) + names = set() + salience = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {}.').format(elt[2], elt[1], elt[3])) + names.add(elt[0]) + salience.add(elt[4]) + print('names(s): {}'.format(names)) + print('salience measure(s): {}'.format(salience)) + + +def extract_description(texts): + """Returns text annotations as a single string""" + document = [] + + for text in texts: + try: + document.append(text['description']) + locale = text['locale'] + # Process only the first entry, which contains all + # text detected. 
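+            # (The remaining annotations describe the individual words.)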
+ break + except KeyError as e: + logging.error('KeyError: %s\n%s' % (e, text)) + return (locale, ' '.join(document)) + + +def extract_descriptions(input_filename, texts, text_analyzer): + """Gets the text that was detected in the image.""" + if texts: + locale, document = extract_description(texts) + text_analyzer.add_entities(input_filename, locale, document) + sys.stdout.write('.') # Output a progress indicator. + sys.stdout.flush() + elif texts == []: + print('%s had no discernible text.' % input_filename) + + +def get_text_from_files(vision, input_filenames, text_analyzer): + """Call the Vision API on a file and index the results.""" + texts = vision.detect_text(input_filenames) + if texts: + for filename, text in texts.items(): + extract_descriptions(filename, text, text_analyzer) + + +def batch(list_to_batch, batch_size=BATCH_SIZE): + """Group a list into batches of size batch_size. + + >>> tuple(batch([1, 2, 3, 4, 5], batch_size=2)) + ((1, 2), (3, 4), (5)) + """ + for i in range(0, len(list_to_batch), batch_size): + yield tuple(list_to_batch[i:i + batch_size]) + + +def main(input_dir, db_filename=None): + """Walk through all the image files in the given directory, extracting any + text from them and feeding that text to the Natural Language API for + analysis. + """ + # Create a client object for the Vision API + vision_api_client = VisionApi() + # Create an object to analyze our text using the Natural Language API + text_analyzer = TextAnalyzer(db_filename) + + if input_dir: + allfileslist = [] + # Recursively construct a list of all the files in the given input + # directory. + for folder, subs, files in os.walk(input_dir): + for filename in files: + allfileslist.append(os.path.join(folder, filename)) + + # Analyze the text in the files using the Vision and Natural Language + # APIs. + for filenames in batch(allfileslist, batch_size=1): + get_text_from_files(vision_api_client, filenames, text_analyzer) + + # Save the result to a database, then run some queries on the database, + # with output to STDOUT. + text_analyzer.write_entity_info_to_db() + + # now, print some information about the entities detected. + text_analyzer.output_entity_data() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Detects text in the images in the given directory.') + parser.add_argument( + '--input_directory', + help='The image directory you\'d like to detect text in. If left ' + 'unspecified, the --db specified will be queried without being ' + 'updated.') + parser.add_argument( + '--db', help='The filename to use for the sqlite3 database.') + args = parser.parse_args() + + if not (args.input_directory or args.db): + parser.error('Either --input_directory or --db must be specified.') + + main(args.input_directory, args.db) diff --git a/language/ocr_nl/main_test.py b/language/ocr_nl/main_test.py new file mode 100755 index 00000000000..c07ed747ea0 --- /dev/null +++ b/language/ocr_nl/main_test.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for main.""" + +import re +import zipfile + +import main + + +_TEST_IMAGE_URI = 'gs://{}/language/image8.png' + + +def test_batch_empty(): + for batch_size in range(1, 10): + assert len( + list(main.batch([], batch_size=batch_size))) == 0 + + +def test_batch_single(): + for batch_size in range(1, 10): + batched = tuple(main.batch([1], batch_size=batch_size)) + assert batched == ((1,),) + + +def test_single_image_returns_text(cloud_config): + vision_api_client = main.VisionApi() + + image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket) + texts = vision_api_client.detect_text([image_path]) + + assert image_path in texts + _, document = main.extract_description(texts[image_path]) + assert "daughter" in document + assert "Bennet" in document + assert "hat" in document + + +def test_single_nonimage_returns_error(): + vision_api_client = main.VisionApi() + texts = vision_api_client.detect_text(['README.md']) + assert "README.md" not in texts + + +def test_text_returns_entities(): + text = "Holmes and Watson walked to the cafe." + text_analyzer = main.TextAnalyzer() + entities = text_analyzer.nl_detect(text) + assert len(entities) == 2 + etype, ename, salience, wurl = text_analyzer.extract_entity_info( + entities[0]) + assert ename == 'holmes' + assert wurl == 'http://en.wikipedia.org/wiki/Sherlock_Holmes' + + +def test_entities_list(cloud_config): + vision_api_client = main.VisionApi() + image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket) + texts = vision_api_client.detect_text([image_path]) + locale, document = main.extract_description(texts[image_path]) + text_analyzer = main.TextAnalyzer() + entities = text_analyzer.nl_detect(document) + assert len(entities) == 4 + etype, ename, salience, wurl = text_analyzer.extract_entity_info( + entities[0]) + assert ename == 'bennet' + assert wurl == 'http://en.wikipedia.org/wiki/Mr_Bennet' + + +def test_main(remote_resource, tmpdir, capsys): + images_path = str(tmpdir.mkdir('images')) + + # First, pull down some test data + zip_path = remote_resource('language/ocr_nl-images-small.zip', tmpdir) + + # Extract it to the image directory + with zipfile.ZipFile(zip_path) as zfile: + zfile.extractall(images_path) + + main.main(images_path, str(tmpdir.join('ocr_nl.db'))) + + stdout, _ = capsys.readouterr() + + assert re.search(r'google was found with count', stdout) diff --git a/language/ocr_nl/requirements.txt b/language/ocr_nl/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/ocr_nl/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/syntax_triples/README.md b/language/syntax_triples/README.md new file mode 100644 index 00000000000..1342ee65289 --- /dev/null +++ b/language/syntax_triples/README.md @@ -0,0 +1,91 @@ +# Using the Cloud Natural Language API to find subject-verb-object triples in text + +This example finds subject-verb-object triples in a given piece of text using +syntax analysis capabilities of +[Cloud Natural Language API](https://cloud.google.com/natural-language/). +To do this, it calls the extractSyntax feature of the API +and uses the dependency parse tree and part-of-speech tags in the resposne +to build the subject-verb-object triples. The results are printed to STDOUT. +This type of analysis can be considered as the +first step towards an information extraction task. 
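+
+At its core, the example just asks the API for an annotated set of tokens and
+then walks the dependency parse. A minimal sketch of that flow, using the
+functions defined in `main.py` below (and assuming this directory is
+importable and credentials are configured as described in the next section),
+might look like:
+
+```python
+import main
+
+text = 'He began his presidential campaign in 2007'
+
+# Ask the API for tokens, with part-of-speech tags and dependency edges.
+analysis = main.analyze_syntax(text)
+tokens = analysis.get('tokens', [])
+
+# Walk the dependency parse, looking for a VERB head with NSUBJ and DOBJ
+# dependents, and pretty-print each (subject, verb, object) triple found.
+for triple in main.find_triples(tokens):
+    main.show_triple(tokens, text, triple)
+```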
+ +## Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Running the example + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +``` +$ pip install -r requirements.txt +``` +You must also be set up to authenticate with the Cloud APIs using your +project's service account credentials, as described above. + +Then, run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + +``` +$ python main.py +``` + +Try this on a sample text in the resources directory: + +``` +$ python main.py resources/obama_wikipedia.txt +``` + +## A walkthrough of the example and its results + +Let's take a look at what the example generates when run on the +`obama_wikipedia.txt` sample file, and how it does it. + +The goal is to find all subject-verb-object +triples in the text. The example first sends the text to the Cloud Natural +Language API to perform extractSyntax analysis. Then, using part-of-speech tags, + it finds all the verbs in the text. For each verb, it uses the dependency +parse tree information to find all the dependent tokens. + +For example, given the following sentence in the `obama_wikipedia.txt` file: + +``` +"He began his presidential campaign in 2007" +``` +The example finds the verb `began`, and `He`, `campaign`, and `in` as its +dependencies. Then the script enumerates the dependencies for each verb and +finds all the subjects and objects. For the sentence above, the found subject +and object are `He` and `campaign`. + +The next step is to complete each subject and object token by adding their +dependencies to them. For example, in the sentence above, `his` and +`presidential` are dependent tokens for `campaign`. This is done using the +dependency parse tree, similar to verb dependencies as explained above. The +final result is (`He`, `began`, `his presidential campaign`) triple for +the example sentence above. + +The script performs this analysis for the entire text and prints the result. +For the `obama_wikipedia.txt` file, the result is the following: + +```sh ++------------------------------+------------+------------------------------+ +| Obama | received | national attention | ++------------------------------+------------+------------------------------+ +| He | began | his presidential campaign | ++------------------------------+------------+------------------------------+ +| he | won | sufficient delegates in the | +| | | Democratic Party primaries | ++------------------------------+------------+------------------------------+ +| He | defeated | Republican nominee John | +| | | McCain | +``` diff --git a/language/syntax_triples/main.py b/language/syntax_triples/main.py new file mode 100644 index 00000000000..1be174bff04 --- /dev/null +++ b/language/syntax_triples/main.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example finds subject-verb-object triples in a given piece of text using +the syntax analysis capabilities of Cloud Natural Language API. The triples are +printed to STDOUT. This can be considered as the first step towards an +information extraction task. + +Run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + $ python main.py + +Try this on a sample text in the resources directory: + $ python main.py resources/obama_wikipedia.txt +""" + +import argparse +import sys +import textwrap + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def dependents(tokens, head_index): + """Returns an ordered list of the token indices of the dependents for + the given head.""" + # Create head->dependency index. + head_to_deps = {} + for i, token in enumerate(tokens): + head = token['dependencyEdge']['headTokenIndex'] + if i != head: + head_to_deps.setdefault(head, []).append(i) + return head_to_deps.get(head_index, ()) + + +def phrase_text_for_head(tokens, text, head_index): + """Returns the entire phrase containing the head token + and its dependents. + """ + begin, end = phrase_extent_for_head(tokens, head_index) + return text[begin:end] + + +def phrase_extent_for_head(tokens, head_index): + """Returns the begin and end offsets for the entire phrase + containing the head token and its dependents. + """ + begin = tokens[head_index]['text']['beginOffset'] + end = begin + len(tokens[head_index]['text']['content']) + for child in dependents(tokens, head_index): + child_begin, child_end = phrase_extent_for_head(tokens, child) + begin = min(begin, child_begin) + end = max(end, child_end) + return (begin, end) + + +def analyze_syntax(text): + """Use the NL API to analyze the given text string, and returns the + response from the API. Requests an encodingType that matches + the encoding used natively by Python. Raises an + errors.HTTPError if there is a connection problem. 
+ """ + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + service = discovery.build( + 'language', 'v1beta1', http=http) + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': get_native_encoding_type(), + } + request = service.documents().annotateText(body=body) + return request.execute() + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def find_triples(tokens, + left_dependency_label='NSUBJ', + head_part_of_speech='VERB', + right_dependency_label='DOBJ'): + """Generator function that searches the given tokens + with the given part of speech tag, that have dependencies + with the given labels. For each such head found, yields a tuple + (left_dependent, head, right_dependent), where each element of the + tuple is an index into the tokens array. + """ + for head, token in enumerate(tokens): + if token['partOfSpeech']['tag'] == head_part_of_speech: + children = dependents(tokens, head) + left_deps = [] + right_deps = [] + for child in children: + child_token = tokens[child] + child_dep_label = child_token['dependencyEdge']['label'] + if child_dep_label == left_dependency_label: + left_deps.append(child) + elif child_dep_label == right_dependency_label: + right_deps.append(child) + for left_dep in left_deps: + for right_dep in right_deps: + yield (left_dep, head, right_dep) + + +def show_triple(tokens, text, triple): + """Prints the given triple (left, head, right). For left and right, + the entire phrase headed by each token is shown. For head, only + the head token itself is shown. + + """ + nsubj, verb, dobj = triple + + # Extract the text for each element of the triple. + nsubj_text = phrase_text_for_head(tokens, text, nsubj) + verb_text = tokens[verb]['text']['content'] + dobj_text = phrase_text_for_head(tokens, text, dobj) + + # Pretty-print the triple. + left = textwrap.wrap(nsubj_text, width=28) + mid = textwrap.wrap(verb_text, width=10) + right = textwrap.wrap(dobj_text, width=28) + print('+' + 30 * '-' + '+' + 12 * '-' + '+' + 30 * '-' + '+') + for l, m, r in zip(left, mid, right): + print('| {:<28s} | {:<10s} | {:<28s} |'.format( + l or '', m or '', r or '')) + + +def main(text_file): + # Extracts subject-verb-object triples from the given text file, + # and print each one. + + # Read the input file. + text = open(text_file, 'rb').read().decode('utf8') + + analysis = analyze_syntax(text) + tokens = analysis.get('tokens', []) + + for triple in find_triples(tokens): + show_triple(tokens, text, triple) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + 'text_file', + help='A file containing the document to process. ' + 'Should be encoded in UTF8 or ASCII') + args = parser.parse_args() + main(args.text_file) diff --git a/language/syntax_triples/main_test.py b/language/syntax_triples/main_test.py new file mode 100755 index 00000000000..62c2915da02 --- /dev/null +++ b/language/syntax_triples/main_test.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import main + + +def test_dependents(): + text = "I am eating a delicious banana" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert [0, 1, 5] == main.dependents(tokens, 2) + assert [3, 4] == main.dependents(tokens, 5) + + +def test_phrase_text_for_head(): + text = "A small collection of words" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert "words" == main.phrase_text_for_head(tokens, text, 4) + + +def test_find_triples(): + text = "President Obama won the noble prize" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + triples = main.find_triples(tokens) + for triple in triples: + assert (1, 2, 5) == triple + + +def test_obama_example(resource, capsys): + main.main(resource('obama_wikipedia.txt')) + stdout, _ = capsys.readouterr() + lines = stdout.split('\n') + assert re.match( + r'.*Obama\b.*\| received\b.*\| national attention\b', + lines[1]) diff --git a/language/syntax_triples/requirements.txt b/language/syntax_triples/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/syntax_triples/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/syntax_triples/resources/obama_wikipedia.txt b/language/syntax_triples/resources/obama_wikipedia.txt new file mode 100644 index 00000000000..1e89d4ab081 --- /dev/null +++ b/language/syntax_triples/resources/obama_wikipedia.txt @@ -0,0 +1 @@ +In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November. He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination. He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009. Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate.