import csv
import random
import re
import time
import logging
from datetime import datetime
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
class Spider(object):
    """Crawl JD.com search results for a keyword and append them to a CSV file.

    NOTE(review): relies on the module-level globals ``ky`` (search keyword),
    ``start_page`` and ``end_page`` set in the ``__main__`` section — confirm
    before reusing this class outside this script.
    """

    def __init__(self):
        self.chromeOptions = self.get_profile()
        self.browser = self.get_browser()
        self.wait = self.get_wait()
        self.log = self.get_log()

    # Logging setup
    def get_log(self):
        """Build and return a logger that writes to both the console and a dated file."""
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.INFO)
        # timestamp - logger name - level - message
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)  # format shown on screen
        today = datetime.now()
        # Log file is named after the global keyword ``ky`` plus today's date.
        log_file_path = "./-{}-{}-{}-{}.log".format(ky, today.year, today.month, today.day)
        handler = logging.FileHandler(log_file_path, encoding='utf-8')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.addHandler(sh)
        return logger

    def main(self):
        """Entry point: write the CSV header, crawl all pages, then quit the browser."""
        self.csv_head()
        self.parse_website(ky, start_page, end_page)
        self.browser.quit()  # close the browser

    def get_profile(self):
        """Return ChromeOptions with a random user agent and image loading disabled."""
        ua = [  # Opera
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
            "Opera/8.0 (Windows NT 5.1; U; en)",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",  # Firefox
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10 Gecko / 20100922Ubuntu / 10.10(maverick)Firefox / 3.6.10",
            # Safari
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
            # chrome
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
            # 360
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",  # Taobao browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            # Liebao browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            # QQ browser
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",  # sogou browser
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            # maxthon browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            # UC browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        ]
        # Chrome-specific settings
        chromeOptions = webdriver.ChromeOptions()
        # chromeOptions.add_argument('--headless')  # Chrome headless mode
        chromeOptions.add_argument('--disable-gpu')  # disable the GPU
        chromeOptions.add_argument('window-size=1280,800')  # fixed browser resolution
        chromeOptions.add_argument("--no-sandbox")
        # Disable image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chromeOptions.add_experimental_option("prefs", prefs)
        chromeOptions.add_argument('user-agent=' + random.choice(ua))
        return chromeOptions

    def get_browser(self):
        # BUGFIX: the ``chrome_options`` keyword was deprecated in Selenium 3.8
        # and removed in Selenium 4; ``options`` works on both.
        browser = webdriver.Chrome(options=self.chromeOptions)
        return browser

    def get_wait(self):
        """Return an explicit wait with a 3-second timeout."""
        wait = WebDriverWait(self.browser, 3)
        return wait

    def parse_website(self, ky, start_page, end_page):
        """Search jd.com for *ky* and scrape product pages start_page..end_page.

        Each product found is written to the CSV via :meth:`save`.
        """
        self.browser.get("https://www.jd.com")  # open the JD home page
        # Wait until the search box is present on the page.
        search_box = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#key")))
        search_box.send_keys(ky)  # type the keyword into the search box
        # Wait until the search button is clickable.
        submit = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.button')))
        submit.send_keys(Keys.ENTER)  # simulate clicking the search button
        # Page through the result listing.
        for i in range(start_page, end_page + 1):
            time.sleep(1)
            # Wait until the page-number input at the bottom pager is present.
            page_box = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")))
            page_box.clear()  # clear the current page number
            page_box.send_keys(i)  # type the target page number
            # Wait until the "go" button next to the page input is clickable.
            submit = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            submit.send_keys(Keys.ENTER)  # simulate clicking the "go" button
            time.sleep(3)
            self.browser.refresh()  # reload the page
            # Scroll to the bottom so lazy-loaded items render.
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            # A page shows up to 60 products; wait for the 60th, but a short
            # page (or slow load) is tolerated and parsed as-is.
            try:
                self.wait.until(EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))
            except Exception:
                pass
            html = self.browser.page_source  # snapshot the rendered HTML
            time.sleep(1)
            print('正在解析第%d页数据' % i)
            parseHtml = etree.HTML(html)
            li_list = parseHtml.xpath('//*[@class="gl-item"]')
            # ``[0]`` on an empty xpath result raises IndexError, which is the
            # one failure each field lookup below deliberately tolerates.
            for count, item in enumerate(li_list, start=1):
                print('正在解析第%d个资源' % count)
                try:
                    sku_id = item.xpath('./@data-sku')[0]
                except IndexError:
                    sku_id = None
                try:
                    img_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@data-lazy-img')[0]
                    img_url = f'https:{img_url}'
                except IndexError:
                    img_url = None
                try:
                    detail_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')[0]
                    if detail_url[:4] != 'http':
                        detail_url = f'https:{detail_url}'
                except IndexError:
                    detail_url = None
                try:
                    price = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')[0]
                except IndexError:
                    price = None
                try:
                    title = ''.join(item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em//text()'))
                except Exception:
                    title = None
                # BUGFIX: guard against title being None before slicing
                # (the original ``title[:4]`` would raise TypeError).
                verify = bool(title) and title[:4] == '京东超市'
                try:
                    comment_num = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-commit"]/strong/a/text()')[0]
                except IndexError:
                    comment_num = None
                try:
                    shop_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/@href')[0]
                    shop_url = 'https:' + shop_url
                except IndexError:
                    shop_url = None
                try:
                    shop_name = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/text()')[0]
                except IndexError:
                    shop_name = None
                info = [title, sku_id, price, verify, comment_num, shop_name, shop_url, img_url, detail_url]
                self.save(info)

    # Write the CSV header row
    def csv_head(self):
        head = ['标题', '商品ID', '价格', '是否为京东超市', '评论数', '店铺名称', '店铺url', '商品图片url', '商品url']
        # newline='' prevents blank lines between rows on Windows.
        with open(fr'{ky}.csv', 'a+', newline='', encoding='utf-8-sig') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(head)

    # Append one product row to the local CSV
    def save(self, info):
        # newline='' prevents blank lines between rows on Windows.
        with open(fr'{ky}.csv', 'a+', newline='', encoding='utf-8-sig') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(info)
if __name__ == "__main__":
    # Prompt for the search keyword and the page range to crawl.
    ky = input('请输入爬取商品信息:')
    start = time.time()
    start_page = int(input('请输入要爬取的起始页(1-80):'))
    end_page = int(input('请输入要爬取的结束页(1-80):'))
    testspider = Spider()
    testspider.main()
    end = time.time()
    # BUGFIX: the original referenced ``self.log`` at module level, which is a
    # guaranteed NameError — log through the spider instance instead.
    testspider.log.info(end - start)
    testspider.log.info('商品爬取完毕,谢谢使用!')
# JD.com product scraper (original article residue: "京东商品抓取",
# published 2024-03-22 10:41:03 — kept as a comment so the module stays valid Python)