Ebay网站商家信息抓取爬虫-多进程
使用selenium库进行ebay商家信息抓取,抓取结果保存到csv文件
# coding:utf-8
"""
ebay网站商家信息抓取
2020.11.10
"""
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from multiprocessing import Process
class Ebay():
def __init__(self, start, end):
    """Store the page range and launch the Chrome driver used for scraping.

    :param start: stored as ``self.start``; its use is not visible in this
        part of the file (presumably the first page handled by this
        worker -- confirm against the callers).
    :param end: stored as ``self.end``; presumably the matching upper
        bound -- confirm against the callers.
    """
    self.csv_file_name = 'women_leggings.csv'
    self.file_name = 'ebay_data_url.csv'
    self.start = start
    self.end = end

    # Base search URL; the page-size and page-number parameters are appended
    # so that next_page() only has to add the page number itself.
    self.copy_url = 'https://www.ebay.fr/sch/i.html?_from=R40&_nkw=women+leggings&_sacat=0&LH_TitleDesc=0&_sop=18&LH_SellerWithStore=1&_fsrp=1&LH_SellerType=2&_fss=1&LH_Sold=1&_oaa=1&_dcat=11511&rt=nc'
    self.url = self.copy_url + '&_ipg=192&_pgn='

    # DesiredCapabilities.CHROME is a class-level dict shared by the whole
    # process. Work on a copy so that changing the page load strategy here
    # does not silently alter every other driver created afterwards.
    desired_capabilities = DesiredCapabilities.CHROME.copy()
    # Change the page load strategy. Without "eager" the final output is
    # delayed, i.e. the driver waits for the page to finish loading first.
    desired_capabilities["pageLoadStrategy"] = "eager"

    self.option = Options()
    chrome_arguments = (
        '--disable-gpu',
        'keep-alive=False',
        '--no-sandbox',
        '--disable-plugins',
        '--disable-javascript',
        '--disable-images',
        # '--headless' is intentionally left disabled here.
        '--ignore-certificate-errors',
        '--ignore-ssl-errors',
        "blink-settings=imagesEnabled=false",
    )
    for argument in chrome_arguments:
        self.option.add_argument(argument)

    # NOTE(review): executable_path / chrome_options / desired_capabilities
    # are the Selenium 3 style keyword arguments; they are kept as-is because
    # the installed Selenium version is not visible from here.
    self.driver = webdriver.Chrome(
        executable_path='../chromedriver/chromedriver.exe',
        chrome_options=self.option,
        keep_alive=False,
        desired_capabilities=desired_capabilities,
    )
    self.wait = WebDriverWait(self.driver, 30)
    self.driver.minimize_window()
def next_page(self, page):
    """Navigate the driver to the requested result page.

    :param page: page number; its string form is appended to ``self.url``.
    :return: the page source reported by the driver after a fixed pause.
    """
    target_url = f"{self.url}{page!s}"
    browser = self.driver
    browser.get(target_url)
    # Fixed three-second pause before the page source is read.
    time.sleep(3)
    html = browser.page_source
    return html
def get_url_list(self):
"""
获取所有橱窗函数 53页 进行翻页和解析存储
:return:
"""
with open('ebay_data_url.csv', 'w', encoding='utf-8', newline='') as fp:
fp.close()
f = open('ebay_data_url.csv', 'a', encoding='utf-8', newline='')
for page in range(54)