-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (71 loc) · 2.8 KB
/
main.py
File metadata and controls
81 lines (71 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy
import pandas
import selenium
# import webdriver_manager
import webdriver_manager.chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm
# Timeout handed to every WebDriverWait in this script.
TIMEOUT = 10

# One entry per ranking to scrape, keyed by the ranking's URL slug.
#   year     -- edition identifier; interpolated into the page URL and the
#               output CSV filename.
#   columns  -- header names of the resulting table / CSV.
#   xpathss  -- one list per column, aligned with ``columns``; each inner list
#               holds candidate XPaths (relative to a table row) that are
#               tried in order until one matches.
TASKS = {
    "bcur": {
        "year": "202510",
        # rank, school name, tags, province/city, type
        "columns": ["排名", "学校名称", "标签", "省市", "类型"],
        "xpathss": [
            ["./td[1]/div"],
            ["./td[2]/div/div[2]/div[1]/div/div/span"],
            ["./td[2]/div/div[2]/p"],
            ["./td[3]"],
            ["./td[4]"],
        ],
    },
    "arwu": {
        "year": "2024",
        # rank, school name, country, national/regional rank
        "columns": ["排名", "学校名称", "国家", "国家/地区排名"],
        "xpathss": [
            ["./td[1]/div"],
            # Two candidate locations for the school name; the first match wins.
            [
                "./td[2]/div/div[2]/div/span",
                "./td[2]/div/div[2]/div[1]/div/div/span",
            ],
            ["./td[3]"],
            ["./td[4]"],
        ],
    },
}
def find_elements(row, xpaths):
    """Return the text of the first element matched by any candidate XPath.

    The XPaths in ``xpaths`` are evaluated against ``row`` in the given order.
    The first XPath that yields at least one element wins, and the ``text`` of
    its first match is returned.  When no XPath matches, ``None`` is returned.
    """
    for candidate in xpaths:
        matches = row.find_elements(By.XPATH, candidate)
        if not matches:
            # Nothing at this location; fall through to the next candidate.
            continue
        return matches[0].text
    return None
def main():
    """Scrape every ranking listed in ``TASKS`` and write each one to a CSV.

    For each task a headless Chrome session opens
    ``https://www.shanghairanking.cn/rankings/<taskname>/<year>``, reads the
    page count from the pagination bar, walks through the pages collecting one
    record per table row (via ``find_elements`` and the task's ``xpathss``),
    and finally saves the table as ``<taskname><year>.csv`` in the current
    working directory.  Missing or empty cells are written as ``'-'``.

    The browser is always shut down, even when a wait times out or parsing
    fails part-way through.
    """
    service = selenium.webdriver.ChromeService(
        webdriver_manager.chrome.ChromeDriverManager(
            # chrome_type=webdriver_manager.core.os_manager.ChromeType.BRAVE,
            # driver_version='137.0.7151.61',
        ).install()
    )
    options = selenium.webdriver.ChromeOptions()
    # options.binary_location = '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'
    options.add_argument('--headless')
    driver = selenium.webdriver.Chrome(service=service, options=options)
    try:
        for taskname, task in tqdm(TASKS.items()):
            # Collect plain row lists and build the DataFrame once at the end;
            # growing a DataFrame one row at a time re-allocates on each insert.
            rows = []
            driver.get(f'https://www.shanghairanking.cn/rankings/{taskname}/{task["year"]}')
            # The 8th pagination item is read as the total number of pages.
            pages = int(WebDriverWait(driver, TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="content-box"]/ul/li[8]/a'))
            ).text)
            for page in tqdm(range(1, pages + 1)):
                # NOTE(review): the table is re-read right after the "next"
                # click without an explicit wait for it to refresh; presumably
                # the page updates quickly enough -- confirm.
                content_box = WebDriverWait(driver, TIMEOUT).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="content-box"]'))
                )
                for row in tqdm(content_box.find_elements(By.XPATH, './div[2]/table/tbody/tr')):
                    rows.append([find_elements(row, xpaths) for xpaths in task['xpathss']])
                if page < pages:
                    # Click through JavaScript rather than WebElement.click();
                    # there is no further page to load after the last one.
                    driver.execute_script(
                        "arguments[0].click();",
                        WebDriverWait(content_box, TIMEOUT).until(
                            EC.element_to_be_clickable(
                                (By.XPATH, './ul/li[contains(@class, "ant-pagination-next")]/a')
                            )
                        )
                    )
            df = pandas.DataFrame(rows, columns=task['columns'])
            # Cells with no matching element (None) or empty text become '-'.
            df.replace([numpy.nan, ''], '-', inplace=True)
            df.to_csv(f'{taskname}{task["year"]}.csv', index=False, encoding='utf-8')
    finally:
        # Always release the browser and chromedriver processes.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()