import ssl
import urllib.request as request
from collections import Counter

import bs4
4+
5+ # import sys
6+ # sys.setrecursionlimit(1000000)
7+
def getData(src):
    """Download the page at ``src`` and group its table cells by year.

    The ``.string`` of every ``<td>`` element is collected in document order
    and then sampled with a fixed stride: starting at cell index 2, every
    sixth cell is tested for the substrings "2018", "2017" and "2016" (in that
    order, first match wins). On a match, the next cell is recorded as the
    country and the cell after that as the industry for that year.

    NOTE(review): the start offset of 2 and stride of 6 presumably mirror the
    column layout of the table on the target page -- confirm against the
    live markup.

    Args:
        src: URL of the page to download.

    Returns:
        A dict with the keys "iL2018", "iL2017", "iL2016" (industry lists),
        "cL2018", "cL2017", "cL2016" (country lists) and "cLtotal" (the three
        country lists concatenated in 2018, 2017, 2016 order).
    """
    # NOTE(security): certificate verification is disabled here through a
    # private ssl helper, so the server's identity is never checked.
    context = ssl._create_unverified_context()
    req = request.Request(src, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    })

    with request.urlopen(req, context=context) as response:
        data = response.read().decode("utf-8")

    root = bs4.BeautifulSoup(data, "html.parser")
    totalList = [cell.string for cell in root.find_all("td")]

    years = ("2018", "2017", "2016")
    industries = {year: [] for year in years}
    countries = {year: [] for year in years}

    first_cell = 2
    row_width = 6
    # Stopping two cells early guarantees x + 1 and x + 2 are valid indexes,
    # so a truncated final row is skipped instead of raising IndexError.
    for x in range(first_cell, len(totalList) - 2, row_width):
        stamp = totalList[x]
        if stamp is None:
            # .string can be None (e.g. a cell with several children); such a
            # cell cannot name a year, so the row is skipped.
            continue
        for year in years:
            if year in stamp:
                countries[year].append(totalList[x + 1])
                industries[year].append(totalList[x + 2])
                break

    cLtotal = countries["2018"] + countries["2017"] + countries["2016"]
    return {
        "iL2018": industries["2018"],
        "iL2017": industries["2017"],
        "iL2016": industries["2016"],
        "cL2018": countries["2018"],
        "cL2017": countries["2017"],
        "cL2016": countries["2016"],
        "cLtotal": cLtotal,
    }
57+
def clear_account(lists):
    """Count how many times each distinct value occurs in ``lists``.

    Args:
        lists: Iterable of hashable values.

    Returns:
        A plain dict mapping each distinct value to its number of
        occurrences, with keys in order of first appearance.
    """
    # Counter tallies everything in one pass, instead of one full
    # list.count() scan per distinct value.
    return dict(Counter(lists))
68+
def sort_1(wokey, limit=3):
    """Return the entries of ``wokey`` that have the largest values.

    Args:
        wokey: Mapping whose values can be ordered (e.g. label -> count).
        limit: Maximum number of entries to keep; defaults to 3.

    Returns:
        A dict holding at most ``limit`` items in descending value order.
        Entries with equal values keep their relative order from ``wokey``.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit < 0:
        raise ValueError("limit must be non-negative")
    ranked = sorted(wokey.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:limit])
80+
81+
82+
83+ # Demo_main
84+
# Download the page once, then keep the highest-count industries per year:
# a -> 2018, b -> 2017, c -> 2016.
src = "https://www.cbinsights.com/research-unicorn-companies"
data = getData(src)
a, b, c = (
    sort_1(clear_account(data[key]))
    for key in ("iL2018", "iL2017", "iL2016")
)
90+
91+
92+ # import plotly.plotly as py
93+ # import plotly.graph_objs as go
94+
95+ # def goBar(set, year):
96+ # labels=[]
97+ # values=[]
98+
99+ # for name in set:
100+ # labels.append(name)
101+ # values.append(set[name])
102+ # data = [go.Bar(
103+ # x=labels,
104+ # y=values
105+ # )]
106+
107+ # py.plot(data, filename=year+' Industry distribution',auto_open=True)
108+
109+
110+ # goBar(a,"2018")
111+ # goBar(b,"2017")
112+ # goBar(c,"2016")
113+
# Industry categories
115+
def getLV(set):
    """Print the keys of ``set`` as one list, then their values as another.

    Args:
        set: Mapping to display. The parameter name shadows the builtin and
            is kept so existing callers keep working.
    """
    labels = list(set)
    values = [set[label] for label in labels]
    print(labels)
    print(values)
125+
# Print labels and counts for the 2018, 2017 and 2016 rankings in turn.
for yearly_top in (a, b, c):
    getLV(yearly_top)
129+
# Country distribution: the three most frequent countries across all three
# years, with every remaining country folded into a single 'Others' entry.

print(len(data["cLtotal"]))
wokey = clear_account(data["cLtotal"])

ranked_countries = sorted(wokey.items(), key=lambda d: d[1], reverse=True)
# Stored under its own name so the builtin sum() is not rebound at module
# level; an empty tail still yields 0, so 'Others' is always present.
others_total = sum(count for _, count in ranked_countries[3:])
wokey_1 = dict(ranked_countries[:3] + [('Others', others_total)])

getLV(wokey_1)
0 commit comments