# Simple requests-based crawler for JD.com product-listing data.
import requests
import json
import re
from bs4 import BeautifulSoup
import pymysql
class SpiderJd:
    """Scrape one JD.com category listing page and append each product
    as a single JSON line (JSON Lines format) to ``Jd.json``."""

    def __init__(self):
        # NOTE(review): the original also sent 'Content-Type' and
        # 'Content-Encoding: gzip' — those are *response* headers and are
        # incorrect on a GET request (they claim the request body is
        # gzipped), so they were dropped.  Only a browser-like User-Agent
        # is needed to pass JD's trivial bot filter.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
        }

    def response_handle(self, base_url):
        """GET *base_url* and return the raw ``requests.Response``.

        A timeout is set so a stalled connection cannot hang the crawl
        forever (the original had none).
        """
        req = requests.get(base_url, headers=self.header, timeout=10)
        # JD list pages are UTF-8; without this, requests may fall back to
        # ISO-8859-1 when the server omits a charset and garble CJK text.
        req.encoding = 'utf-8'
        return req

    def parse(self, response):
        """Return the <li> product nodes of the listing page.

        Returns an empty list when the expected
        ``<ul class="gl-warp clearfix">`` container is missing (e.g. an
        anti-bot interstitial page) instead of raising IndexError.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find('ul', class_="gl-warp clearfix")
        if container is None:
            return []
        return container.find_all('li')

    def save_data(self, data):
        """Extract title/image/price/privileges from one product <li>
        node and append the record to ``Jd.json``.

        Items missing any required field (promo text, image URL, price)
        are skipped instead of crashing the whole crawl with an
        IndexError, as the bare ``[0]`` lookups in the original did.
        """
        promo = data.find('i', class_="promo-words")
        imgs = re.findall('-img="//(.*?\\.jpg)"', str(data))
        prices = data.select('strong > i')
        if promo is None or not imgs or not prices:
            return  # malformed or sponsored item — skip it
        record = {
            'title': promo.text,
            'img': "https://" + imgs[0],
            'price': "¥" + prices[0].text,
            'privilege': [i.text for i in
                          data.find_all('i', class_="goods-icons4 J-picon-tips")],
        }
        # One JSON object per line so the file stays appendable across runs.
        with open('Jd.json', 'a+', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False)
            f.write('\n')

    def main(self):
        """Crawl the hard-coded JD category listing page end to end."""
        url = "https://list.jd.com/list.html?cat=16750,16755,16809"
        resp = self.response_handle(url)
        for item in self.parse(resp):
            self.save_data(item)
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not on import.
    SpiderJd().main()