import json
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
class VipSpider(object):
    """Scrape product listings from vip.com search results.

    Uses Selenium (Firefox) to drive the search UI and render lazy-loaded
    pages, then requests + BeautifulSoup to paginate and parse them.
    Results are written to ``vip2.json``.
    """

    def __init__(self, url, search, start_page, end_page):
        self.url = url                # site entry URL, e.g. http://www.vip.com/
        self.search = search          # search keyword typed into the site
        self.start_page = start_page  # first result page to crawl (str or int)
        self.end_page = end_page      # last result page to crawl, inclusive
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"}
        self.driver = webdriver.Firefox()

    def handle_click(self):
        """Open the site, type the keyword into the search box and submit it."""
        self.driver.get(self.url)
        # find_elements_by_xpath was removed in Selenium 4; use the
        # find_element(By.XPATH, ...) API instead.
        self.driver.find_element(By.XPATH, "//*[@id='J_main_nav_link']/li[13]/a").click()
        sleep(2)
        self.driver.find_element(By.XPATH, "//*[@id='J-search']/div[1]/input").send_keys(self.search)
        sleep(2)
        self.driver.find_element(By.XPATH, "//*[@id='J-search']/div[1]/a/span").click()
        sleep(3)

    def handle_url(self, page):
        """Derive the URL of result page *page* from the driver's current URL.

        Returns the final URL (after requests resolves the query string).
        """
        current = self.driver.current_url
        # Current URL looks like
        #   https://category.vip.com/suggest.php?keyword=...&ff=235|12|1|1
        # Drop the trailing &ff=... parameter so the page number can be
        # appended cleanly.
        index = current.rfind("&")
        if index != -1:  # guard: rfind returns -1 when no '&' is present,
            current = current[:index]  # and current[:-1] would chop a character
        res = requests.get(url=current, params={"page": page}, headers=self.headers)
        print(res.url)
        return res.url

    def scroll_page(self, req):
        """Load *req* in the browser, scroll to the bottom repeatedly so the
        lazy-loaded product images appear, and return the rendered HTML."""
        self.driver.get(req)
        sleep(3)
        for _ in range(20):
            # Execute a scroll-to-bottom script to trigger lazy loading.
            self.driver.execute_script("var q=document.documentElement.scrollTop=10000")
            sleep(5)
        return self.driver.page_source

    def download(self, request):
        """Parse a rendered results page (*request* is an HTML string) and
        return a list of per-product dicts."""
        soup = BeautifulSoup(request, "lxml")
        sections = soup.select("section#J_searchCatList")
        if not sections:
            return []  # no result list on this page (e.g. empty search)
        items = []
        for div in sections[0].select("div.c-goods"):
            items.append({
                "图片链接": div.img["data-original"],                        # image link
                "商品名称": div.select("h4.goods-info a")[0].get_text(),     # product title
                "商品折扣": div.select("div.goods-info span")[0].get_text(), # discount
                "特卖价格": div.select("div.goods-info em")[0].get_text(),   # sale price
                "原始价格": div.select("div.goods-info del.goods-market-price ")[0].get_text(),  # market price
            })
        return items

    def startSpider(self):
        """Crawl pages start_page..end_page and dump all items to vip2.json."""
        items = []
        try:
            for page in range(int(self.start_page), int(self.end_page) + 1):
                if page == 1:
                    # The first page must be reached by driving the UI search.
                    self.handle_click()
                page_url = self.handle_url(page)
                html = self.scroll_page(page_url)
                items += self.download(html)
        finally:
            self.driver.quit()  # always release the browser, even on error
        # ensure_ascii=False keeps the Chinese keys/values human-readable.
        with open("vip2.json", "w", encoding="utf-8") as fp:
            fp.write(json.dumps(items, ensure_ascii=False))
def main():
    """Prompt the user for a keyword and a page range, then run the spider."""
    entry_url = "http://www.vip.com/"
    keyword = input("请输入你要搜索的商品:")
    first_page = input("请输入你要爬取的起始页:")
    last_page = input("请输入你要爬取的结束页:")
    VipSpider(entry_url, keyword, first_page, last_page).startSpider()


if __name__ == '__main__':
    main()
# Source note: originally published as a blog post titled "python爬虫(爬取唯品会)"
# ("Python crawler: scraping Vipshop"); latest recommended revision 2024-07-29 10:35:11.