每个 dd 标签对应一个 item:
完整的dd标签
爬取的html
匹配排名re
封面
最终匹配
仿写版本:下面的脚本只使用标准库,没有额外的包,可以直接运行。
# -*- coding: utf-8 -*-
import re
from urllib import request
# Request headers sent with the page fetch (a browser-style User-Agent string).
headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"}

# 51job search-result page. The long percent-encoded path segment is the
# double-URL-encoded UTF-8 search keyword (it decodes to the Chinese phrase
# for "software engineer").
url = 'https://search.51job.com/list/030200,000000,0000,32,9,99,%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,2.html'

# Seconds to wait on the network before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Number of matches at the end of the page that are not printed.
# NOTE(review): presumably the last three <div> matches are not real
# listings (footer/pagination blocks?) -- confirm against the live markup.
TRAILING_MATCHES_TO_SKIP = 3

# One match per <div> block. Five capture groups, in order:
#   1. the title="..." attribute of the first <a> inside the <div>
#   2. the text of the next <a>
#   3-5. the text of the next three <span> elements
# NOTE(review): these presumably correspond to job title, company, location,
# salary and posting date -- verify against the page.
# re.S lets '.' cross newlines, because one listing spans several lines.
pattern = re.compile(
    r'<div.*?>.*?<a.*?title="(.*?)".*?</a>.*?<a.*?>(.*?)</a>.*?<span.*?>(.*?)</span>.*?'
    r'<span.*?>(.*?)</span>.*?<span.*?>(.*?)</span>.*?</div>',
    re.S,
)


def parse_jobs(html):
    """Return every listing found in *html* as a list of 5-tuples of strings.

    Each tuple holds the five capture groups of ``pattern``; an input with no
    matching ``<div>`` block yields an empty list.
    """
    return pattern.findall(html)


def fetch_html():
    """Download ``url`` and return the page body decoded as GBK text.

    The response headers are printed before the body is read. The response is
    closed when the ``with`` block exits, even if reading or decoding fails.

    Raises:
        urllib.error.URLError: if the request fails or times out.
    """
    req = request.Request(url=url, headers=headers)
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as response:
        print(response.info())
        # errors='replace' keeps a single malformed byte from aborting the
        # whole run; the affected character becomes U+FFFD instead.
        return response.read().decode('gbk', errors='replace')


def main():
    """Fetch the search page and print each extracted listing tuple."""
    content_list = parse_jobs(fetch_html())
    for item in content_list[:-TRAILING_MATCHES_TO_SKIP]:
        print(item)


if __name__ == '__main__':
    main()