# ps:初次接触爬虫,故参考了球友的代码 作者:tipire https://www.jianshu.com/p/8537178639a8
from pyquery import PyQuery as pq
import requests
import threadpool
def download_html(word, timeout=10):
    """Fetch the Youdao dictionary page for *word* and extract its entry.

    Args:
        word: The word to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The dict returned by ``decode_html`` (seeded with ``{'Word': word}``)
        when the server answers with status 200; an empty dict for any other
        status code; ``None`` if fetching or parsing raised an exception.
    """
    # Imported here so the block needs no change to the file-level imports.
    from urllib.parse import quote

    output = {'Word': word}
    final_output = {}
    # Percent-encode the word so characters such as '/', '?' or '#' cannot
    # alter the structure of the request URL.
    url = 'http://dict.youdao.com/w/eng/{}/'.format(quote(str(word), safe=''))
    try:
        # Without a timeout, a stalled connection would block the caller
        # indefinitely.
        r = requests.get(url, timeout=timeout)
        if r.status_code == 200:
            doc = pq(r.text)
            final_output = decode_html(doc, output)
            print(final_output)
    except Exception as e:
        # Best-effort lookup: report the failure together with its cause
        # instead of discarding the exception, then signal failure with None.
        print('抓取页面异常,抓取不到:' + word + ' (' + repr(e) + ')')
        return None
    return final_output
def decode_html(doc, output):
    """Fill *output* with the pronunciation and definition text found in *doc*.

    Args:
        doc: Parsed page object exposing ``items(selector)``, which yields
            elements that each provide a ``text()`` method.
        output: Dict updated in place; its ``Proc`` and ``Desc`` entries are
            overwritten.

    Returns:
        The same *output* dict. ``Proc`` holds the texts of all elements
        matching ``.baav .pronounce`` and ``Desc`` the texts of all elements
        matching ``#phrsListTab .trans-container ul li``; each is joined with
        no separator and is ``''`` when nothing matches.
    """
    # str.join builds each field in one pass instead of repeated '+' copies.
    output['Proc'] = ''.join(
        pro.text() for pro in doc.items('.baav .pronounce')
    )
    output['Desc'] = ''.join(
        li.text() for li in doc.items('#phrsListTab .trans-container ul li')
    )
    return output
# Words to look up; each one becomes its own work request.
word_list = ['spring', 'duck', 'python', 'beautiful', 'caption']
# Thread pool created with a size argument of 10.
pool = threadpool.ThreadPool(10)
# One request per word, each calling download_html(word).
word_pool = threadpool.makeRequests(download_html, word_list)
# Plain loop: putRequest is called only for its side effect, so there is no
# reason to build a throwaway list with a comprehension.
for req in word_pool:
    pool.putRequest(req)
# Wait for the queued requests to finish before the script continues.
pool.wait()
# Sample output:
# {'Word': 'caption', 'Proc': "英 ['kæpʃ(ə)n]美 ['kæpʃən]", 'Desc': 'n. 标题;字幕;说明;逮捕vt. 加上说明;加上标题'}
# {'Word': 'python', 'Proc': "['paɪθɑn]", 'Desc': 'n. 巨蟒;大蟒n. (法)皮东(人名)'}
# {'Word': 'duck', 'Proc': '英 [dʌk]美 [dʌk]', 'Desc': 'n. 鸭子;鸭肉;(英)宝贝儿;零分vi. 闪避;没入水中vt. 躲避;猛按…入水n. (Duck)人名;(德、葡、匈)杜克'}
# {'Word': 'beautiful', 'Proc': "英 [ˈbju:tɪfl]美 ['bjʊtəfəl]", 'Desc': 'adj. 美丽的出色地出色的迷人的迷人地'}
# {'Word': 'spring', 'Proc': '英 [sprɪŋ]美 [sprɪŋ]', 'Desc': 'n. 春天;弹簧;泉水;活力;跳跃adj. 春天的vi. 生长;涌出;跃出;裂开vt. 使跳起;使爆炸;突然提出;使弹开n. (Spring)人名;(德)施普林;(英、芬、瑞典)斯普林'}
# [Finished in 0.5s]