one.py
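"""Scrape a few web pages, extract notable words with spaCy, and score each
word's readability with textstat, writing the results to a JSON word list.

Note: nltk.sent_tokenize and WordNetLemmatizer may need the 'punkt' and
'wordnet' NLTK data packages (nltk.download('punkt'), nltk.download('wordnet')).
"""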
import re
import urllib.request

from bs4 import BeautifulSoup
import nltk
import spacy
import textstat

# collapse runs of linebreaks and the whitespace that follows them
NEWLINES = re.compile(r'[\r\n]\s+')

# word -> readability score, filled in by fin() and written out by store()/jso()
wordle = {}
def getPageText(url):
    # given a url, get page content
    data = urllib.request.urlopen(url).read()
    # parse as an html structured document
    bs = BeautifulSoup(data, 'html.parser')
    # drop javascript content
    for s in bs.find_all('script'):
        s.replace_with('')
    # find body and extract text
    txt = bs.find('body').get_text('\n')
    # remove multiple linebreaks and whitespace
    return NEWLINES.sub('\n', txt)
def genwrd(txt):
    nlp = spacy.load('en_core_web_lg')
    #nlp = spacy.load('en_core_web_sm')  # a smaller model also works
    # keep only these part-of-speech tags
    included_tags = {"ADJ", "INTJ", "NOUN", "VERB"}
    words = []
    # join the page texts, flatten linebreaks, and split into sentences
    sent_text = nltk.sent_tokenize(''.join(txt).replace("\n", " "))
    sentences = sent_text[:10]  # first 10 sentences
    for sentence in sentences:
        for token in nlp(sentence):
            if token.pos_ in included_tags:
                word = token.text
                if word not in words:
                    words.append(word)
    return words
# Separating individual words out (keep purely alphabetic tokens)
def separate(words):
    final_words = []
    for word in words:
        if word.isalpha():
            final_words.append(word)
    return final_words
# Lemmatizing words
def lemmatize(sep_words):
    from nltk.stem import WordNetLemmatizer
    words = []
    lemmatizer = WordNetLemmatizer()
    for word in sep_words:
        words.append(lemmatizer.lemmatize(word))
    return words
# Final Controller Module: score each word and record it in the word list
def fin(words):
    from nltk.corpus import wordnet
    for word in words:
        # skip words that WordNet does not know about
        syns = wordnet.synsets(word.lower())
        if not syns:
            continue
        # score the word with the Automated Readability Index
        # (textstat.flesch_reading_ease could be used instead)
        wordle[word] = textstat.automated_readability_index(word.lower())
    jso()
def execute(words):
    words_sep = separate(words)
    words = lemmatize(words_sep)
    fin(words)
# write the word list to a csv file (optional helper, not called by main)
def store():
    import csv
    with open('wordstest.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in wordle.items():
            writer.writerow([key, value])

# write the word list to a json file
def jso():
    import json
    with open("wordlist.txt", 'w') as f:
        json.dump(wordle, f)
def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead',
        'https://www.geeksforgeeks.org/programming-language-choose/'
    ]
    txt = [getPageText(url) for url in urls]
    words = genwrd(txt)
    execute(words)


if __name__ == "__main__":
    main()