爬虫好久没弄了,今天正好是杨老师工作报到的第一周,领导突然要求准备所有工程院和科学院院士的信息。幸好只需要一段简介,这里爬虫技术就派上用场了。
先在网上找了一圈,发现工程院有源代码。爬完以后换了科学院的网址,发现正则不好用,爬信息的时候Html有好多种类,一个正则无法覆盖全部的网页。好在Xpath的结构是一样的,因此在原工程院的代码基础上进行改进,使用多线程加速,最终快速高效地爬到了所有科学院院士的信息。下面直接Show you code。
import concurrent.futures
import os
import re
import socket
from urllib.error import URLError, HTTPError
from urllib.request import urlopen, Request
from lxml import etree
def scraping(url):
    """Fetch one academician's profile page and save the biography to a .txt file.

    Parameters
    ----------
    url : str
        Absolute URL of the academician's profile page.

    Returns
    -------
    str or None
        The extracted biography text, or None if the download or extraction
        failed (a message is printed in that case).
    """
    print(f'Scraping {url} ...')
    try:
        req = Request(
            url,
            data=None,
            headers={
                # The site rejects urllib's default User-Agent, so present a browser one.
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
        with urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')
    except (URLError, HTTPError, ConnectionResetError, socket.timeout) as e:
        print(f'Error: {e}')
        return None
    # Extract the name and the biography from the parsed page.
    tree = etree.HTML(html)
    try:
        # xpath() returns a list; only IndexError means "element not found",
        # so catch exactly that instead of a bare except.
        name = tree.xpath('//p[@class="wztitle"]/text()')[0]
    except IndexError:
        print(f'Error: Failed to extract name from {url}.')
        return None
    # Strip stray whitespace/newlines so the name is safe as a filename.
    name = name.strip()
    sci = ''.join(tree.xpath('//div[@class="acadTxt"]//text()'))
    # BUG FIX: ''.join() on an empty xpath result never raises, so the old
    # bare-except error branch was dead code and an empty profile was silently
    # written to disk. Treat an empty extraction as a failure instead.
    if not sci:
        print(f'Error: Failed to extract personal profile from {url}.')
        return None
    # BUG FIX: ensure the output directory exists before writing (the original
    # crashed with FileNotFoundError on a fresh run).
    os.makedirs('YuanShi_Sci', exist_ok=True)
    path = os.path.join('YuanShi_Sci', f'{name}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(sci)
    return sci
if __name__ == '__main__':
    # Seed page listing every CAS academician; the comment below the original
    # promised dedup + sort, which is now actually done.
    start_url = r'http://casad.cas.cn/ysxx2022/ysmd/qtys/'
    with urlopen(start_url, timeout=10) as fp:
        content = fp.read().decode()
    # Each profile link looks like:
    #   <a href="http://casad.cas.cn/ysxx2022/ysmd/..." target="_blank">name</a>
    pattern = r'<a href="http://casad.cas.cn/ysxx2022/ysmd/(.+)" target="_blank">(.+)</a>'
    result = re.findall(pattern, content)
    # BUG FIX: the original kept duplicate links despite claiming to
    # de-duplicate; build a set, then sort for deterministic processing order.
    urls = sorted({'http://casad.cas.cn/ysxx2022/ysmd/' + item[0] for item in result})
    # BUG FIX: create the output directory before worker threads write to it.
    os.makedirs('YuanShi_Sci', exist_ok=True)
    # Fan the I/O-bound downloads out across a thread pool and collect results.
    datas = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(scraping, url): url for url in urls}
        for future in concurrent.futures.as_completed(futures):
            url = futures[future]
            try:
                data = future.result()
                # BUG FIX: scraping() returns None on failure; the original
                # appended those too, inflating the final success count.
                if data is not None:
                    datas.append(data)
            except Exception as exc:
                print(f'{url} generated an exception: {exc}')
    print(f'Total {len(datas)} profiles have been collected.')