【Python爬虫实战】爬取京东商城的商品信息

爬取京东商城的商品信息,并将数据以 DataFrame 形式展示

import ast
import collections
import re

import pandas as pd
import pymysql
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By

class Spider:
    """Scrape product information from JD.com with Selenium.

    Workflow: open the JD home page, install login cookies loaded from a
    local MySQL table, search for a keyword, then visit each result page
    and collect its parameter table.  Collected rows are displayed as a
    pandas DataFrame.
    """

    def __init__(self):
        # Selenium Chrome driver; created lazily by getmain_by_selenium().
        self.browser = None
        # One dict of {parameter name: value} per scraped product page.
        self.info_diclist = []

    @staticmethod
    def _parse_pair(text):
        """Split a single ``name: value`` parameter string into a 2-tuple.

        Mirrors the original splitting behaviour: the value is the text up
        to the next colon, should one appear.
        """
        parts = re.split(r":\s*", text)
        return parts[0], parts[1]

    def onepage_info_by_selenium(self, browser, url):
        """Scrape the parameter list of one product detail page.

        Appends a dict of parameter name -> value to ``self.info_diclist``.
        """
        browser.get(url)
        # The brand sits in its own <ul>, separate from the other parameters.
        brand = browser.find_element(by=By.CSS_SELECTOR, value="ul[id='parameter-brand'][class='p-parameter-list']")
        record = {}
        name, value = self._parse_pair(brand.text)
        record[name] = value
        info = browser.find_element(by=By.CSS_SELECTOR, value="ul[class='parameter2 p-parameter-list']")
        for item in re.findall(r"\S+:\s*\S+", info.text):
            name, value = self._parse_pair(item)
            record[name] = value
        self.info_diclist.append(record)

    def getmain_by_selenium(self):
        """Open the JD home page and install the stored login cookies.

        Returns the Chrome driver; a handle is also kept on ``self.browser``
        so the instance attribute declared in ``__init__`` is actually used.
        """
        browser = Chrome()
        browser.get('https://www.jd.com')
        for cookie in self.get_cookie():
            browser.add_cookie(cookie)
        self.browser = browser
        return browser

    def get_search_result(self, browser, key) -> list:
        """Search for *key* and return the detail-page URLs of all hits."""
        inputbar = browser.find_element(by=By.ID, value='key')
        inputbar.send_keys(key)
        browser.find_element(by=By.CSS_SELECTOR, value="[class='button'][aria-label='搜索']").click()
        browser.implicitly_wait(5)
        hrefs = browser.find_elements(by=By.CSS_SELECTOR, value="div[class='p-img'] a[target='_blank']")
        return [href.get_attribute(name='href') for href in hrefs]

    def get_info(self):
        """Drive the full scrape: search, visit the first 3 hits, show a DataFrame."""
        browser = self.getmain_by_selenium()
        results = self.get_search_result(browser, "投影仪")
        for url in results[:3]:
            self.onepage_info_by_selenium(browser, url=url)
        self.show_as_dataframe()
        # Keep the browser window open until the user confirms.
        input("end: ")

    def show_as_dataframe(self):
        """Print the collected per-product dicts as a pandas DataFrame."""
        print(pd.DataFrame(self.info_diclist))

    def get_cookie(self):
        """Load the stored JD cookies from the local MySQL ``draft.COOKIES`` table.

        Returns the cookie list saved by a previous login session.
        """
        db = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='123456',
            charset='utf8',
            database='draft',
            port=3306
        )
        # try/finally guarantees the connection is released even when the
        # query fails; the cursor context manager closes the cursor likewise.
        try:
            with db.cursor() as cursor:
                cursor.execute(query="SELECT COOKIE FROM COOKIES WHERE WEB_NAME='jingdong'")
                raw = cursor.fetchall()[0][0]
        finally:
            db.close()
        # literal_eval parses only Python literals — unlike eval() it cannot
        # execute arbitrary code if the stored cookie text is tampered with.
        return ast.literal_eval(raw)


def main():
    """Entry point: build a Spider and run the full scrape."""
    spider = Spider()
    spider.get_info()


# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值