# Scrape the Hupu ranking list with XPath.
import urllib, urllib.request, re
import pymongo
import requests
from requests import RequestException
from lxml import etree
# Module-wide MongoDB handle pointing at localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Names of the databases present on the server; not read anywhere in the
# visible code, but the lookup is kept because it talks to the server.
dblist = client.list_database_names()
# Database "shoes" and the collection "shoes" inside it; parse_one_page
# inserts its records into `col`.
db = client.get_database('shoes')
col = db.get_collection('shoes')
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole script.

    Returns:
        The response text when the server answers with HTTP 200. ``None``
        for any other status code, or when the request fails with a
        ``RequestException`` (this includes timeouts).
    """
    # NOTE(review): the Cookie value looks like a captured browser session;
    # confirm it is still required by the target site.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '__mta=247222843.1583412681923.1583413132794.1583413232162.3; uuid_n_v=v1; uuid=0ABC5A305EE111EAB4D88BE885FBEA2366FFF5830AE14967928D6733384E1EB9; _csrf=12e26cdddb7d1e0529892a5d883f0d6b0aaf8a52a5a253aef10e4f2c0a06a33e; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1583412681; _lxsdk_cuid=170aac0197dc8-0fa89683372aa4-4313f6b-1fa400-170aac0197dc8; _lxsdk=0ABC5A305EE111EAB4D88BE885FBEA2366FFF5830AE14967928D6733384E1EB9; mojo-uuid=a745e1380d9a9aa555d41243bc5d338d; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1583413232; _lxsdk_s=170aaeff84f-3e7-519-675%7C%7C1',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Best-effort fetch: network problems are reported as "no page".
        return None
    if response.status_code == 200:
        return response.text
    return None
def _first_or_none(node, path):
    """Return the first result of evaluating XPath ``path`` on ``node``, or ``None``."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def parse_one_page(html):
    """Extract the ranking entries from ``html`` and insert them into ``col``.

    Every ``<li>`` under ``<div class="box last">`` is treated as one entry.
    For each entry the rank (``h3`` text), shoe name (``p/a`` text), overall
    rank (``h4`` text) and image URL (``_src`` attribute of the nested
    ``img``) are read and stored as one document.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) results in no work being done.

    Returns:
        None. The function is used for its side effect of writing to ``col``.
    """
    if not html:
        # Nothing was downloaded; there is nothing to parse.
        return
    tree = etree.HTML(html)
    if tree is None:
        # Defensive: skip if the parser produced no document root.
        return
    for entry in tree.xpath('//div[@class="box last"]//ul//li'):
        record = {
            '排名': _first_or_none(entry, './h3//text()'),
            '鞋名': _first_or_none(entry, './p/a/text()'),
            '综合排名': _first_or_none(entry, './h4//text()'),
            '图片链接': _first_or_none(entry, './div/div/p/a/img/@_src'),
        }
        # An <li> lacking any expected node is skipped rather than aborting
        # the whole run with an IndexError.
        if any(value is None for value in record.values()):
            continue
        col.insert_one(record)
def main():
    """Download the Hupu ranking page and store its entries.

    Raises:
        SystemExit: if the page could not be downloaded, so the script ends
            with a readable message and a non-zero exit status instead of
            handing ``None`` to the parser.
    """
    url = 'http://zb.hupu.com/rank'
    html = get_one_page(url)
    if html is None:
        raise SystemExit('Failed to download {}'.format(url))
    # parse_one_page returns nothing; it writes the entries to MongoDB.
    parse_one_page(html)


if __name__ == '__main__':
    main()