selenium采集1688简单信息

1688信息采集

首先进入1688可以看见多种类别信息
类别
我们选择其中的一种,例如选择女装之后,又会有很多的类别
类别
将类别保存到列表,方便下面继续运行

detail_urls = []  # one landing-page URL per category, visited in turn later

# Every category entry is an <a> inside the left-hand menu widget.
categories = driver.find_elements_by_xpath("//div[@class='ch-menu-item']/div[@class='ch-menu-item-list']/ul[@class='fd-clr']/li[@class='item']/a")
detail_urls.extend(link.get_attribute('href') for link in categories)

xpath获取所有类别的url
xpath获取
遍历进入每个类别页面后，页面数据是以 Ajax 动态方式加载的
我们可以模拟鼠标下滑,一直加载,达到页面底部的时候退出循环
示例代码如下:

# Keep scrolling until the document height stops changing, i.e. the
# Ajax-loaded list has reached its end.
while True:
    height_before = driver.execute_script("return document.body.scrollHeight;")

    # documentElement is the document root node; jumping its scrollTop far
    # down forces the next batch of lazy-loaded items to be requested.
    driver.execute_script("var q=document.documentElement.scrollTop=20000")
    time.sleep(3.0)  # data loads slowly; scrolling too fast stalls the page

    height_after = driver.execute_script("return document.body.scrollHeight;")
    if height_before == height_after:
        break

然后再利用xpath获取信息

源代码

# -*- coding:utf-8 -*-

from selenium import webdriver
import time
import csv

detail_urls = []
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')	# run Chrome without a visible window
chrome_options.add_argument('--disable-gpu')
# Machine-specific chromedriver path — adjust for your environment.
driver_path = r"D:\python3.7\Chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path,options=chrome_options)

# https://home.1688.com/?spm=a260k.dacugeneral.1998214976.14.6633436ciDKU2e  (home & household goods category page)
driver.get('https://home.1688.com/?spm=a260k.dacugeneral.1998214976.14.6633436ciDKU2e')

# Every category entry is an <a> inside the left-hand menu widget.
categories = driver.find_elements_by_xpath("//div[@class='ch-menu-item']/div[@class='ch-menu-item-list']/ul[@class='fd-clr']/li[@class='item']/a")

# Column order must be deterministic: a set iterates in a hash-dependent
# order that can change between runs, and since the file is opened in
# append mode a later run would scramble columns relative to this header.
keys = ['url', 'title', 'price', 'volumes']
with open("Home_Depot.csv",'a',encoding='utf-8-sig',newline='')as fp:
    writer = csv.DictWriter(fp,keys)
    writer.writeheader()  # write the header row once, before any data rows

def write_csv(url, title, price, volumes):
    """Append one product record to Home_Depot.csv.

    Args:
        url: product detail-page URL.
        title: product title text.
        price: price text as scraped.
        volumes: sales-volume text as scraped.
    """
    # Must be a list, not a set: set iteration order is hash-randomized and
    # could change between runs, scrambling columns relative to the header.
    fieldnames = ['url', 'title', 'price', 'volumes']
    row = {'url': url, 'title': title, 'price': price, 'volumes': volumes}
    with open("Home_Depot.csv", 'a', encoding='utf-8-sig', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames)
        writer.writerow(row)

# Collect the landing-page URL of every category.
for category in categories:
    url = category.get_attribute('href')
    detail_urls.append(url)

# Visit each category page, scroll until all Ajax items are loaded,
# then scrape every product card and append it to the CSV.
for url in detail_urls:
    driver.get(url)
    time.sleep(1.0)
    t = True
    while t:
        check_height = driver.execute_script("return document.body.scrollHeight;")

        # documentElement is the document root; jump far down to trigger
        # the next batch of lazy-loaded items.
        js = "var q=document.documentElement.scrollTop=20000"
        driver.execute_script(js)
        time.sleep(3.0)  # loading too fast stalls the scroll

        check_height1 = driver.execute_script("return document.body.scrollHeight;")
        if check_height == check_height1:
            t = False  # height stopped growing: page fully loaded

    hrefs = driver.find_elements_by_xpath("//div[@class='list']/div/a")
    titles = driver.find_elements_by_xpath("//div[@class='title-container']/div[@class='offer-title']")
    prices = driver.find_elements_by_xpath("//span[@class='price-num']/span[@class='alife-bc-uc-number']")
    volumes = driver.find_elements_by_xpath("//div[@class='list']/div/a/div[@class='clearfix']/div[1]")
    # zip truncates to the shortest list, so a card missing a price or
    # volume node no longer raises IndexError and aborts the whole crawl.
    for href, title, price, volume in zip(hrefs, titles, prices, volumes):
        write_csv(href.get_attribute('href'), title.get_attribute('textContent'),
                  price.get_attribute('textContent'), volume.get_attribute('textContent'))
    print("该%s已经获取完全"%url)

最后得到的数据如下
数据

最后得到的信息有重复数据
利用pandas筛选

# -*- coding:utf-8 -*-

import pandas as pd

# Machine-specific input path — adjust as needed.
csvframe = pd.read_csv(r'E:\python\self\1688\Ornament.csv', encoding = "utf-8")
dup = csvframe.drop_duplicates(subset='url')   # drop rows whose 'url' value duplicates an earlier row (keeps the first occurrence)
dup.to_csv('Ornament.csv',index=False ,encoding='utf_8_sig')

得到的数据就没有重复的了

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页