[Extraction note: GitHub page chrome and the 1–114 line-number gutter removed. Original file: string_dispose.py, 114 lines (91 loc), 3.52 KB.]
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#with open (file) as f:
# for line in f:
# do something...
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter  # stdlib equivalent of nltk.FreqDist (FreqDist subclasses Counter)

# Accumulated word frequencies, shared across calls so repeated invocations
# of process() merge their counts (the original design processed many files).
dictionary = {}

# Tokens excluded from the ranking: CJK punctuation, common Chinese/English
# function words, and whitespace artifacts left by the cleaning pass.
stopwords = ['、','(',')',',','。',':','“','”',
             '\n\u3000','\u3000','的','‘','’',
             'a','in','also','below','am','is','are','have',
             'the','of',',',' ','and','this','to','be',
             'that','it','was','by']

def process(text=None):
    """Count word frequencies in *text*, merge them into the shared
    ``dictionary``, and print the accumulated words (stopwords removed)
    in descending frequency order, space-separated with a leading space.

    Parameters
    ----------
    text : str | None
        Text to analyse. When ``None`` (the default, preserving the
        original behaviour) one line is read from standard input.

    Returns
    -------
    str
        The string that was printed, e.g. ``' hello world'``.
    """
    if text is None:
        text = input()

    # Cleaning pass: keep alphabetic characters and spaces verbatim;
    # newlines and underscores become ', '; everything else (digits,
    # punctuation) becomes a single space.
    cleaned_chars = []
    for ch in text:
        if ch.isalpha() or ch == ' ':
            cleaned_chars.append(ch)
        elif ch == '\n' or ch == '_':
            cleaned_chars.append(', ')
        else:
            cleaned_chars.append(' ')
    cleaned = ''.join(cleaned_chars)

    # Per-call frequencies. Empty tokens produced by consecutive spaces are
    # dropped here — the original counted '' as a word, which polluted the
    # ranked output with a phantom entry.
    fredist = Counter(tok for tok in cleaned.split(' ') if tok)

    # Merge this call's counts into the shared dictionary, skipping
    # stopwords once (the original filtered them a second, redundant time
    # when building the output list).
    for token, count in fredist.items():
        if token in stopwords:
            continue
        dictionary[token] = dictionary.get(token, 0) + count

    # Rank all accumulated words by frequency, highest first, and print.
    ranked = sorted(dictionary.items(), key=lambda kv: kv[1], reverse=True)
    output_str = ''.join(' ' + word for word, _count in ranked)
    print(output_str)
    return output_str
def write_to_file(words, file='results.txt'):
    """Write (word, count) pairs to *file*, one ``word count`` per line.

    Parameters
    ----------
    words : iterable of (str, int)
        Ranked word/frequency pairs, e.g. sorted ``dictionary`` items.
    file : str
        Destination path (default ``'results.txt'``).
    """
    # 'with' guarantees the handle is closed even if a write raises
    # (the original leaked it on error). UTF-8 matches the encoding used
    # elsewhere in this script and is required for the Chinese tokens
    # this pipeline handles.
    with open(file, 'w', encoding='UTF-8') as f:
        for word, count in words:
            f.write(f'{word} {count}\n')
process()