Python 爬取京东商品信息

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from random import randint
from pyquery import PyQuery as pq
import pymongo


class JD(object):
    """Selenium-driven scraper for JD.com search-result pages.

    The scraper opens the JD home page in headless Chrome, submits a keyword
    search, walks through the result pages and stores every product both as a
    line in a text file and as a document in MongoDB.

    Typical use is ``JD().execute()``.  ``execute`` releases the browser and
    the MongoDB connection when it finishes, so an instance is single-use.
    """

    # Defaults preserved from the original script; each one can be overridden
    # through the constructor.
    DEFAULT_KEYWORD = "iphone"
    DEFAULT_DRIVER_PATH = r"C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe"
    DEFAULT_CSV_PATH = r"G:\个人总结\csv\jd_iphone.csv"

    # MongoDB target used by save_to_mongo().
    MONGO_URL = "localhost"
    MONGO_DB = "JingDong"
    MONGO_COLLECTION = "products"

    # Upper bound, in seconds, on how long scroll() waits for the in-page
    # scroll animation to report completion.
    SCROLL_TIMEOUT = 30

    def __init__(self, keyword=None, driver_path=None, csv_path=None):
        """Start headless Chrome and remember the crawl settings.

        :param keyword: search term typed into the JD search box
            (defaults to ``DEFAULT_KEYWORD``).
        :param driver_path: path of the chromedriver executable
            (defaults to ``DEFAULT_DRIVER_PATH``).
        :param csv_path: file that ``write`` appends to
            (defaults to ``DEFAULT_CSV_PATH``).
        """
        self.keyword = keyword if keyword is not None else self.DEFAULT_KEYWORD
        self.csv_path = csv_path if csv_path is not None else self.DEFAULT_CSV_PATH
        if driver_path is None:
            driver_path = self.DEFAULT_DRIVER_PATH
        # Created lazily by save_to_mongo() and shared by all inserts.
        self._mongo_client = None
        self.url = "https://www.jd.com/?cu=true&utm_source=baidu-search&utm_medium=cpc&utm_campaign=t_262767352_baidusearch&utm_term=106807362512_0_1ea216375c8242409e3b4487043f782b"

        self.options = webdriver.ChromeOptions()
        # Headless mode: run Chrome without a visible window.
        self.options.add_argument("headless")
        # NOTE(review): `executable_path` / `chrome_options` are the keywords
        # the original script used; newer Selenium releases are believed to
        # expect `service=` / `options=` instead -- confirm against the
        # installed Selenium version before upgrading.
        self.browser = webdriver.Chrome(
            executable_path=driver_path,
            chrome_options=self.options
        )
        # Maximize the browser window.
        self.browser.maximize_window()

    @staticmethod
    def _parse_page_count(text):
        """Return the first whole number found in *text*, or 1 if there is none.

        Falling back to 1 means "scrape just the page that is showing" when
        the pager element is missing, instead of failing on ``int("")``.
        """
        for token in str(text or "").split():
            if token.isdecimal():
                return int(token)
        return 1

    @staticmethod
    def _absolute_url(href):
        """Turn an ``href`` attribute value into an absolute https URL.

        A missing attribute yields ``""``; a value that already carries a
        scheme is returned unchanged; anything else gets the ``https:``
        prefix the original code applied.
        """
        if not href:
            return ""
        href = str(href)
        if "://" in href:
            return href
        return "https:" + href

    def _build_product(self, item):
        """Extract one product dict from a ``.gl-item`` pyquery node."""
        image_link = item(".p-img a")
        shop_link = item(".p-shop a")
        # Join only the parts that exist so an item without a shop link no
        # longer raises TypeError on `str + None`.
        shop_parts = (shop_link.text(), self._absolute_url(shop_link.attr("href")))
        return {
            "href": self._absolute_url(image_link.attr("href")),
            "title": str(image_link.attr("title") or "").replace("\n", " "),
            "price": item(".p-price").text(),
            "name": str(item(".p-name em").text()).replace("\n", " "),
            "commit": item(".p-commit").text(),
            "shop": " ".join(part for part in shop_parts if part),
            "icons": item(".p-icons").text(),
            "stock": item(".p-stock").text()
        }

    def scroll(self):
        """Scroll the current page to the bottom and wait until that is done.

        The injected script scrolls 100px every 50ms and appends
        ``scroll-done`` to the document title when it reaches the bottom.
        This method blocks until that marker shows up (at most
        ``SCROLL_TIMEOUT`` seconds) so callers read the page source only
        after the scroll has completed.
        """
        # Imported here so the file's top-level import block stays untouched.
        from selenium.common.exceptions import TimeoutException

        self.browser.execute_script("""
            (function () {
                // Drop any marker left by an earlier call so the wait on the
                // Python side only succeeds once THIS scroll has finished.
                document.title = document.title.replace(/scroll-done/g, "");
                var y = document.body.scrollTop;
                var step = 100;
                window.scroll(0, y);
                function f() {
                    if (y < document.body.scrollHeight) {
                        y += step;
                        window.scroll(0, y);
                        setTimeout(f, 50);
                    }
                    else {
                        window.scroll(0, y);
                        document.title += "scroll-done";
                    }
                }
                setTimeout(f, 1000);
            })();
            """)
        try:
            WebDriverWait(self.browser, self.SCROLL_TIMEOUT).until(
                EC.title_contains("scroll-done"))
        except TimeoutException:
            # Best effort: carry on with whatever has been loaded so far.
            print("scroll did not finish within {}s; continuing".format(
                self.SCROLL_TIMEOUT))

    def skip(self):
        """
        Go from the home page to the search-result page for ``self.keyword``.

        :return: number of result pages reported by the pager
            (1 when the pager cannot be read).
        """
        self.browser.get(self.url)
        wait = WebDriverWait(self.browser, 5)
        # Wait for the search box to be present.
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
        # Wait for the search button to become clickable.
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".button")))
        time.sleep(2)
        # Clear whatever keyword is pre-filled.
        search_box.clear()
        # Type the keyword we want to search for.
        search_box.send_keys(self.keyword)
        time.sleep(2)
        # Simulate the click that submits the search.
        submit.click()
        # Give the result page a moment to open.
        time.sleep(randint(1, 3))
        # Scroll down; this blocks until the scroll has finished.
        self.scroll()
        doc = pq(self.browser.page_source)
        # Read the total number of result pages.
        return self._parse_page_count(doc(".p-skip b").text())

    def getData(self):
        """Scrape every ``.gl-item`` product on the page currently displayed.

        The page is NOT reloaded here: the browser is expected to be on a
        result page already (after ``skip`` or ``next``).  Reloading a fixed
        search URL, as the original did, threw away the page reached by
        ``next`` and scraped page 1 over and over.
        """
        doc = pq(self.browser.page_source)
        for index, item in enumerate(doc(".gl-item").items()):
            product = self._build_product(item)
            # Random pause between items, kept from the original pacing.
            time.sleep(randint(1, 2))
            print(index, product)
            self.write(product)
            self.save_to_mongo(product)

    def next(self):
        """Click the "next page" button and scroll the new page to the bottom."""
        wait = WebDriverWait(self.browser, 3)
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".pn-next")))
        time.sleep(3)
        submit.click()
        print(self.browser.current_url)
        time.sleep(2)
        self.scroll()

    def write(self, content):
        """Append ``str(content)`` as one line to ``self.csv_path``.

        NOTE(review): despite the default ``.csv`` extension each line is the
        ``str()`` form of the product dict, not comma-separated values; the
        format is kept as-is for compatibility with existing output files.
        """
        with open(self.csv_path, "a", encoding="utf-8") as file:
            file.write(str(content) + "\n")

    def save_to_mongo(self, result):
        """Insert *result* into the MongoDB collection, best effort.

        One client is created on first use and reused afterwards instead of
        opening a new connection for every product.  A failed insert is
        reported and otherwise ignored so the crawl keeps going.
        """
        if self._mongo_client is None:
            self._mongo_client = pymongo.MongoClient(self.MONGO_URL)
        collection = self._mongo_client[self.MONGO_DB][self.MONGO_COLLECTION]
        try:
            collection.insert_one(result)
        except Exception as exc:
            print("存储到MongoDB失败: {}".format(exc))

    def close(self):
        """Release the MongoDB client and quit the browser, if they exist."""
        client = getattr(self, "_mongo_client", None)
        if client is not None:
            client.close()
            self._mongo_client = None
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()

    def execute(self):
        """Run the whole crawl: search, then scrape each result page in turn.

        Resources are released in all cases, including when a step raises.
        """
        try:
            count = self.skip()
            for i in range(count):
                print("第{}页".format(i + 1))
                self.getData()
                # There is no further page to open after the last one.
                if i < count - 1:
                    self.next()
        finally:
            self.close()


if __name__ == "__main__":
    # Script entry point: build the scraper, then run the full crawl.
    scraper = JD()
    scraper.execute()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值