import csv
import requests
from lxml import etree
from pymongo import MongoClient
class Pa_jd():
    """Scrape JD.com search results: download pages, parse items, store them.

    Typical use is ``run()`` (download each result page to a local HTML
    file), then ``fetch()`` (parse those files into ``self.list1``), then
    ``mongo_db()`` and/or ``csv()`` to persist the parsed items.

    Attributes:
        keyword: search term entered by the user.
        num: number of result pages to download and parse.
        list1: parsed items, each a dict with "price", "title" and "href".
        host, port: address of the MongoDB server used by ``mongo_db()``.
    """

    def __init__(self) -> None:
        """Prompt for the search keyword and page count; set storage defaults."""
        self.keyword = str(input('输入搜索内容'))
        self.num = int(input('爬取几页?'))
        self.list1 = []
        self.host = '127.0.0.1'
        self.port = 27017

    def _page_indexes(self):
        """Return the 1-based indexes of all requested pages (1..num inclusive)."""
        # The upper bound is num + 1 so that asking for N pages yields N pages.
        return range(1, self.num + 1)

    def _html_path(self, index):
        """Return the local file name used to cache result page *index*."""
        return 'jd搜索%s%s.html' % (self.keyword, index)

    @staticmethod
    def _pair_items(prices, titles, hrefs):
        """Combine parallel price/title/href lists into a list of item dicts.

        Pairing is positional, so two items with the same price keep their
        own title and link. Pairing stops at the shortest of the three lists.
        """
        return [
            {"price": price, "title": title, "href": href}
            for price, title, href in zip(prices, titles, hrefs)
        ]

    def run(self) -> None:
        """Download every requested result page and save it as a local HTML file."""
        headers = {
            'Referer': 'https://item.jd.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        url = 'https://search.jd.com/Search'
        for i in self._page_indexes():
            # The `page` query parameter takes odd values: 1, 3, 5, ...
            page = 2 * i - 1
            params = {
                'keyword': self.keyword,
                'page': page,
                # NOTE(review): offset formula kept exactly as in the original
                # code; confirm against the live site.
                's': (page - 1) * 30 - 4,
                'click': 0
            }
            # A timeout keeps a stalled connection from hanging the script forever.
            r = requests.get(url, headers=headers, params=params, timeout=10)
            print(r.headers)  # response headers
            print(r.url)  # final request URL
            with open(self._html_path(i), 'w', encoding='utf-8') as f:
                f.write(r.content.decode())

    def fetch(self) -> None:
        """Parse the saved HTML pages and append the extracted items to ``list1``."""
        for i in self._page_indexes():
            with open(self._html_path(i), 'r', encoding='utf-8') as f:
                text = f.read()
            html = etree.HTML(text)
            # NOTE(review): the three queries run independently, so if an entry
            # lacks one of the fields the lists may drift out of step; per-<li>
            # extraction would be more robust -- verify against real pages.
            price_list = html.xpath('//li/div/div[@class="p-price"]/strong/i/text()')
            title_list = html.xpath('//li/div/div[@class="p-name p-name-type-2"]/a/em/text()')
            href_list = html.xpath('//li/div/div[@class="p-name p-name-type-2"]/a/@href')
            self.list1.extend(self._pair_items(price_list, title_list, href_list))

    def csv(self) -> None:
        """Write the parsed items to a CSV file with columns title, price, href."""
        # newline='' lets the csv module control line endings itself; without
        # it, extra blank rows can appear on platforms that translate '\n'.
        with open('jd手机搜索%s.csv' % self.keyword, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, ['title', 'price', 'href'])
            writer.writeheader()
            writer.writerows(self.list1)

    def mongo_db(self) -> None:
        """Insert the parsed items into the ``jd1`` collection of database ``jd``.

        Does nothing when there are no items, since ``insert_many`` rejects an
        empty document list.
        """
        if not self.list1:
            return
        # The context manager closes the client connection when done.
        with MongoClient(self.host, self.port) as client:
            collection = client['jd']['jd1']
            # Insert copies: insert_many adds an "_id" key to the dicts it is
            # given, which would otherwise leak into list1 and break csv().
            collection.insert_many([dict(item) for item in self.list1])
# Pipeline: download the result pages, parse them, then store the items in MongoDB.
p = Pa_jd()
for step in (p.run, p.fetch, p.mongo_db):
    step()
# p.csv()  # optional: also export the parsed items to a CSV file
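# Notes (just scraping for fun):
# `params` holds the query-string parameters that follow the `?` in the address bar.
# Both `s` and `page` must be present. The site's `page` value does not go 1, 2, 3, 4, ...
# but 1, 3, 5, 7, 9, ..., so it is computed as page = i + (i - 1), with i starting at 1
# and increasing by one per page.
# s = (page - 1) * 30 - 4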