# Environment: Python 3.7+ on Windows 10
# -*- coding: utf-8 -*-
# @Time : 2022/6/27 16:55
# @Author : 刘浪
# @File : JD_spider.py
import requests
from lxml import etree
import xlwt
from spider_seting import * #请求头模块
import time
from selenium import webdriver
import random
#禁用警告
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Excel:
    """Thin wrapper around an xlwt workbook holding one sheet of goods rows.

    Row 0 carries the fixed header; every call to ``write_content`` fills
    the next free row below it.
    """

    # Index of the next data row to write (row 0 is the header). Kept as a
    # class-level default; each instance gets its own counter in __init__.
    _current_row = 1

    def __init__(self, sheet_name='sheet1'):
        """Create the workbook and sheet and write the header row.

        Args:
            sheet_name: Name given to the single worksheet.
        """
        # Column titles, in the order rows are expected to supply values.
        title_label = ['商品编号', '商品名称', '图片路径', '价格', '商家', '商品详情地址']
        self._current_row = 1
        # utf-8 rather than ascii: the header and the data are non-ASCII text.
        self.write_work = xlwt.Workbook(encoding='utf-8')
        self.write_sheet = self.write_work.add_sheet(sheet_name)
        for col, title in enumerate(title_label):
            self.write_sheet.write(0, col, label=title)

    def write_content(self, content):
        """Write one record into the next free row.

        Args:
            content: Sequence of cell values; element *i* goes to column *i*.
        """
        for col, value in enumerate(content):
            self.write_sheet.write(self._current_row, col, label=value)
        # Advance to the next row once the whole record has been written.
        self._current_row += 1

    def save_file(self, file_url='./dj_data.xls'):
        """Save the workbook to *file_url* and report the outcome on stdout.

        Args:
            file_url: Destination path of the ``.xls`` file.

        Returns:
            True when the file was written, False when an OSError occurred.
        """
        try:
            self.write_work.save(file_url)
        except OSError as error:
            # Best effort: report the reason instead of aborting the caller.
            print("文件保存失败!", error)
            return False
        print("文件保存成功!文件路径为:" + file_url)
        return True
def get_html(url,
             chrome_driver_path="C:/Users/Administrator/Desktop/TEM/chromedriver.exe",
             browser_binary="G:/360browser/360Chrome/Chrome/Application/360chrome.exe"):
    """Open *url* in a Selenium-driven browser and return the parsed page.

    Args:
        url: Address of the page to load.
        chrome_driver_path: Path to the chromedriver executable.
        browser_binary: Path to the browser executable to be driven.

    Returns:
        The result of ``etree.HTML`` applied to the page source captured
        after scrolling to the bottom of the page.
    """
    option = webdriver.ChromeOptions()
    option.binary_location = browser_binary
    option.add_argument('--no-sandbox')
    # Add '--headless' here to run without a visible browser window.
    option.add_argument('--disable-gpu')
    # Hide the "controlled by automated test software" notice.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path=chrome_driver_path, options=option)
    try:
        browser.get(url)
        time.sleep(random.randint(1, 2))
        # Scroll to the bottom, presumably so lazily loaded items get
        # rendered before the source is captured -- TODO confirm.
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(random.randint(1, 3))
        source = browser.page_source
    finally:
        # Always shut the browser down, even when loading or scrolling fails,
        # so no browser/driver process is left behind.
        browser.quit()
    return etree.HTML(source)
def _extract_goods_row(li):
    """Build one spreadsheet row from a ``gl-item`` list element."""
    # Read the SKU from the element itself so it can never be paired with a
    # different item.
    serial_num = li.get("data-sku", "")
    name = "".join(li.xpath(r"div/div[@class='p-name p-name-type-2']/a/em/text()"))
    # When data-lazy-img reads "done" the address is taken from src instead.
    lazy_img = "".join(li.xpath(r'.//div/div/a/img/@data-lazy-img'))
    if lazy_img == "done":
        img_url = "https:" + "".join(li.xpath(r'.//div/div/a/img/@src'))
    else:
        img_url = "https:" + lazy_img
    # xpath returns lists; join them so every cell receives a plain string.
    price = "".join(li.xpath(r".//div/div/strong/i/text()"))
    shop = "".join(li.xpath(r".//div/span/a/text()"))
    detail_addr = "https:" + "".join(li.xpath(r".//div/div[1]/a/@href"))
    return [serial_num, name, img_url, price, shop, detail_addr]


def save_data(search_url):
    """Scrape one search-result page and append its goods to the workbook.

    Uses the module-level ``excel`` object created by the script entry
    point: one row is written per ``gl-item`` element, then the workbook is
    saved to ``./dj_data.xls`` (relative to the working directory; the
    ``.xls`` suffix is required).

    Args:
        search_url: Address of the search-result page to scrape.
    """
    html = get_html(search_url)
    if html is None:
        # NOTE(review): the parser may yield None for an empty document;
        # nothing to record in that case.
        return
    for li in html.xpath(r'.//li[@class="gl-item"]'):
        excel.write_content(_extract_goods_row(li))
    # Save after every page so collected rows survive a later failure.
    excel.save_file("./dj_data.xls")
if __name__ == '__main__':
    from urllib.parse import quote

    # Workbook shared with save_data through the module-level name.
    excel = Excel()
    # Search keyword typed by the user.
    keyword = input("输入搜索的关键词=")
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break the query string.
    encoded_keyword = quote(keyword)
    # Steps through the odd page numbers 1..199; presumably the site numbers
    # successive result pages this way -- TODO confirm.
    for page in range(1, 200, 2):
        search_url = ('https://search.jd.com/Search?keyword=' + encoded_keyword
                      + "&page=" + str(page) + '&enc=utf-8')
        print(search_url)
        # Pause between pages; going too fast tends to cause errors.
        time.sleep(2)
        save_data(search_url)