又做了一回爬JD信息的爬虫,但是这次爬取的内容更多更全,其实写代码本身不难,主要费时间的就是找相关信息的url,详细代码如下:
防水处理了一下url,base64加密的,解密可用如下函数:
import base64
def dec(string):
    """Decode a base64-encoded text string back to its original UTF-8 form."""
    raw = string.encode()
    return base64.b64decode(raw).decode()
完整代码如下:
import requests
import re
import time
import json
import os
import traceback
from lxml import etree
from collections import OrderedDict
def get_text(href):
    """Fetch *href* and return the response body as text.

    Sends a desktop-Chrome User-Agent plus a JD item-page Referer so the
    request looks like normal browsing. Returns '' on any network or HTTP
    error, letting callers treat a failed fetch as an empty page.

    :param href: absolute URL to fetch
    :return: decoded response body, or '' on failure
    """
    hds = {
        'Referer': 'https://item.jd.com/1361956.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    try:
        # timeout added: the original call could block forever on a stalled server
        resp = requests.get(href, headers=hds, timeout=10)
        # check the status first — no point charset-sniffing an error page
        resp.raise_for_status()
        # JD pages often mis-declare their charset; trust the sniffed one
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return ''
def uniform_url(text):
    """Normalize a product href to an absolute URL.

    JD listing pages emit protocol-relative hrefs ('//item.jd.com/...');
    those get an 'https:' prefix. Anything that already carries a scheme
    is returned unchanged.

    Bug fixed: the original `'https' not in text` test would mangle a
    plain 'http://...' URL into 'https:http://...'.

    :param text: href attribute value scraped from the page
    :return: absolute URL string
    """
    if text.startswith('//'):
        return 'https:' + text
    return text
def get_urls(href):
    """Scrape a JD search/listing page and return absolute product-page URLs.

    :param href: URL of a JD goods-list page
    :return: iterable of absolute item URLs (empty list if the fetch failed)
    """
    text = get_text(href)
    if not text:
        # get_text returns '' on failure; etree.HTML('') would raise a
        # ParserError ("Document is empty"), so bail out early.
        return []
    html = etree.HTML(text)
    url_list = html.xpath('//div[@id="J_goodsList"]/ul/li//div[3]/a/@href')
    return [uniform_url(u) for u in url_list]