[Extraction note: GitHub page chrome and the 1–114 line-number gutter removed. Original file: string_dispose.py, 114 lines (91 loc), 3.52 KB.]
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#with open (file) as f:
# for line in f:
# do something...
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter  # stdlib equivalent of nltk.FreqDist (FreqDist subclasses Counter)

# Accumulated word frequencies, shared across calls so repeated invocations
# of process() merge their counts (the original design processed many files).
dictionary = {}

# Tokens excluded from the ranking: CJK punctuation, common Chinese/English
# function words, and whitespace artifacts left by the cleaning pass.
stopwords = ['、','(',')',',','。',':','“','”',
             '\n\u3000','\u3000','的','‘','’',
             'a','in','also','below','am','is','are','have',
             'the','of',',',' ','and','this','to','be',
             'that','it','was','by']

def process(text=None):
    """Count word frequencies in *text*, merge them into the shared
    ``dictionary``, and print the accumulated words (stopwords removed)
    in descending frequency order, space-separated with a leading space.

    Parameters
    ----------
    text : str | None
        Text to analyse. When ``None`` (the default, preserving the
        original behaviour) one line is read from standard input.

    Returns
    -------
    str
        The string that was printed, e.g. ``' hello world'``.
    """
    if text is None:
        text = input()

    # Cleaning pass: keep alphabetic characters and spaces verbatim;
    # newlines and underscores become ', '; everything else (digits,
    # punctuation) becomes a single space.
    cleaned_chars = []
    for ch in text:
        if ch.isalpha() or ch == ' ':
            cleaned_chars.append(ch)
        elif ch == '\n' or ch == '_':
            cleaned_chars.append(', ')
        else:
            cleaned_chars.append(' ')
    cleaned = ''.join(cleaned_chars)

    # Per-call frequencies. Empty tokens produced by consecutive spaces are
    # dropped here — the original counted '' as a word, which polluted the
    # ranked output with a phantom entry.
    fredist = Counter(tok for tok in cleaned.split(' ') if tok)

    # Merge this call's counts into the shared dictionary, skipping
    # stopwords once (the original filtered them a second, redundant time
    # when building the output list).
    for token, count in fredist.items():
        if token in stopwords:
            continue
        dictionary[token] = dictionary.get(token, 0) + count

    # Rank all accumulated words by frequency, highest first, and print.
    ranked = sorted(dictionary.items(), key=lambda kv: kv[1], reverse=True)
    output_str = ''.join(' ' + word for word, _count in ranked)
    print(output_str)
    return output_str
def write_to_file(words, file='results.txt'):
    """Write (word, count) pairs to *file*, one ``word count`` per line.

    Parameters
    ----------
    words : iterable of (str, int)
        Ranked word/frequency pairs, e.g. sorted ``dictionary`` items.
    file : str
        Destination path (default ``'results.txt'``).
    """
    # 'with' guarantees the handle is closed even if a write raises
    # (the original leaked it on error). UTF-8 matches the encoding used
    # elsewhere in this script and is required for the Chinese tokens
    # this pipeline handles.
    with open(file, 'w', encoding='UTF-8') as f:
        for word, count in words:
            f.write(f'{word} {count}\n')
process()