# 搜索京东商品 使用动态数据抓取 import requests from bs4 import BeautifulSoup import re class JD: def __init__(self): self.headers = { 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} ##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦) def req(self, url): # 解析京东搜索首页地址 soup1 = self.requests_utf(url) allcount = soup1.find('span', id='J_resCount').get_text() print('共有%s件商品' % allcount) print('**************************************************************************') # 查询搜索的商品总页数 page = int(soup1.find('span', class_='fp-text').i.get_text()) for i in range(1, page * 2, 2): url_star = url[:-2] + str(i) # 根据商品页数解析搜索地址 soup = self.requests_utf(url_star) # 定位商品信息 li_all = soup.find_all('li', class_='gl-item') for i in li_all: # 商品标题 title = i.a['title'] # 商品实际地址 href = i.a['href'] if href[:4] == 'http': pass else: href = 'https:' + href # 价格 price = float(i.i.get_text()) # 解析商品实际地址 soup_href = self.requests_gbk(href) real_href = soup_href.find('link', rel="canonical")['href'] real_href = 'https:' + real_href # 定位商品名称并去空格 sku_name = soup_href.find('div', class_='sku-name') product_name = str(sku_name.get_text()).strip() # 产品Id search_ID = re.search('\d+', real_href) product_ID = search_ID.group() # 解析产品评价js,返回数据 summary = self.product_summary(product_ID) goodRateShow = ''.join(summary[0]) goodCountStr = ''.join(summary[1]) if price <= 10000: print('商品ID:', product_ID) print('商品:', product_name) print('商品标题:', title) print('好评累计:', goodCountStr) print('好评率:', goodRateShow + '%') print('链接地址:', real_href) print('价格:', price) print('*************************************************************************') else: continue # print('可用优惠券:',quan) # print('优惠券信息:',quan_item) # 产品评价 def product_summary(self, product_ID): url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv5292&productId=' + product_ID + '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1' pro_req = self.requests_gbk(url) # 好评率 pattern0 = re.compile('goodRateShow":(.*?),"poorRateShow') # 好评累计 pattern1 = re.compile('"goodCountStr":"(.*?)",') # 部分评价 pattern_ping0 = re.compile('("content":"(.*?){2}","creationTime)') search0 = re.findall(pattern0, str(pro_req)) search1 = re.findall(pattern1, str(pro_req)) return search0, search1 # 解析网页utf-8 def requests_utf(self, url): try: content = requests.get(url, headers=self.headers) content.encoding = 'utf-8' soup = BeautifulSoup(content.text, 'lxml') return soup except: print('网页解析发生错误!!!!') # 解析网页gbk def requests_gbk(self, url): try: content = requests.get(url, headers=self.headers) content.encoding = 'gbk' soup = BeautifulSoup(content.text, 'html5lib') return soup except: print('网页解析发生错误!!!!') jd = JD() search_word = '女士长款钱包' url = 'https://search.jd.com/Search?keyword=' + search_word + '&enc=utf-8' jd.req(url)
python爬虫-京东全网搜索
最新推荐文章于 2024-07-24 13:47:38 发布