Python京东爬虫

最新推荐文章于 2024-06-06 16:26:09 发布

卷儿哥

最新推荐文章于 2024-06-06 16:26:09 发布

阅读量5.5k

点赞数 1

分类专栏： Python 文章标签： python 爬虫 mysql

本文链接：https://blog.csdn.net/DahlinSky/article/details/104252919

版权

Python 专栏收录该内容

16 篇文章 1 订阅

订阅专栏

Python京东爬虫

这是我以前写的一个实验性质的京东爬虫脚本，基于 selenium 库和 Chrome 浏览器实现，可以根据指定的商品名称抓取京东商城上的商品明细列表，并存入 MySQL 数据库。

京东爬虫的Github项目地址

4. 执行SQL脚本创建数据表

-- Schema for the crawler's result table: one row per product scraped from a search page.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- WARNING: re-running this script drops the table together with all rows crawled so far.
DROP TABLE IF EXISTS `jdshop`;
-- Every text column uses utf8mb4 so that 4-byte characters (e.g. emoji in titles or
-- promo text) can be stored; this also matches the table default and SET NAMES above.
CREATE TABLE `jdshop`  (
  `sid` int(11) NOT NULL AUTO_INCREMENT COMMENT '商品ID',                                                      -- surrogate row id
  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '搜索词',      -- search term
  `shop` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '店铺名称',        -- shop name
  `label` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '前置标签',        -- label shown before the title
  `title` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '商品标题',       -- product title
  `advertising` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '广告',  -- promo text
  `price` decimal(8, 2) NULL DEFAULT NULL COMMENT '价格',                                                      -- list price
  `pinggou` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '拼购价',       -- group-buy price text
  `plus` decimal(8, 2) NULL DEFAULT NULL COMMENT '会员价',                                                      -- member price
  `comment` varchar(30) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '评论数',       -- review count text
  `tag` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '打标',           -- icon tags
  `isale` int(11) NULL DEFAULT NULL COMMENT '销量排名',                                                          -- sales rank
  `ctime` datetime(0) NULL DEFAULT NULL,                                                                        -- crawl start time written by the script
  -- The primary key is already unique; no separate UNIQUE index on `sid` is needed.
  PRIMARY KEY (`sid`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 6001 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

5. 修改连接字符串和商品名称

# Name of the product to search for and scrape
KEYWORD = '手机'
# MySQL connection settings (replace host, user, password and database with your own)
conn = pymysql.connect(host='192.168.72.128', port=3306, user='admin', passwd='XXXXXX', db='dahlindb')

6. 执行脚本源代码

import re
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pg
import json
import csv
from bs4 import BeautifulSoup
import pymysql

# --- Crawl settings -------------------------------------------------------
# Running count of products seen so far; stored in the `isale` column.
INDEX = 0
# Taken once at start-up; every row of this run gets the same `ctime`.
CREATE_TIME = time.strftime("%Y-%m-%d %H:%M:%S")
# Search term to scrape.
KEYWORD = '手机'
# Search URL template; the placeholders are filled in by index_page().
URL = (
    'https://search.jd.com/Search'
    '?keyword={key1}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={key2}'
    '&page={page}&s={amount}&psort=3&cid2=653&cid3=655&click=0'
)
# Number of result pages to visit.
MAX_PAGE = 100
# Parameterized insert: one placeholder per column, in the listed order.
sql_insert = (
    "INSERT INTO jdshop "
    "(keyword,shop,label,title,advertising,price,pinggou,plus,comment,tag,isale,ctime) "
    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
)

# --- Headless Chrome session ----------------------------------------------
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
# Explicit waits on this browser give up after 10 seconds.
wait = WebDriverWait(browser, 10)

# --- MySQL connection (adjust to your own server) -------------------------
conn = pymysql.connect(
    host='192.168.72.128',
    port=3306,
    user='admin',
    passwd='XXXXX',
    db='dahlindb',
)
cursor = conn.cursor()


def filter_title(thtml):
    """
    Extract the leading label and the cleaned title from a product-name fragment.

    :param thtml: HTML fragment of a product name; may be ``None`` or empty
        when the element was not found
    :return: ``(label, title)`` tuple of strings. ``label`` is the text of the
        first ``<span>`` in the fragment (``''`` if there is none); ``title`` is
        the content of the first ``<em>`` with span/font/img markup removed
        (``''`` if there is no ``<em>``)
    """
    label, title = '', ''
    # Nothing to parse: return empty values instead of letting re.search raise.
    if not thtml:
        return label, title

    front_title = re.search(r'<span.*?>(.*?)</span>', thtml, re.S)
    if front_title:
        label = front_title.group(1)

    result = re.search(r'<em>(.*?)</em>', thtml, re.S)
    # Without an <em> there is no title text; keep the label that was found.
    if result is None:
        return label, title

    # Drop the label span, then flatten highlight (<font>) and icon (<img>) markup.
    temp = re.sub(r'<span.*?</span>', '', result.group(1))
    temp = re.sub(r'<font.*?>', ' ', temp)
    temp = re.sub(r'</font>', '', temp)
    title = re.sub(r'<img.*?/>', ' ', temp)
    return label, title


def index_page(index):
    """
    Load one search-result page in the browser and pass its HTML to get_products.

    :param index: 1-based number of the result page to fetch
    :return: None; a selenium TimeoutException is printed and swallowed
    """
    # The first page uses 1 for both URL parameters; later pages derive them
    # from the index.
    # NOTE(review): `index + 2` yields page=4, 5, 6, ... for index 2, 3, 4 -
    # confirm this is the numbering the search endpoint expects.
    amount = (index - 1) * 60 + 1 if index > 1 else 1
    page = index + 2 if index > 1 else 1
    try:
        print("正在扒取第{page}页".format(page=index))
        encoded_keyword = quote(KEYWORD)
        url = URL.format(
            key1=encoded_keyword,
            key2=encoded_keyword,
            page=page,
            amount=amount,
        )
        browser.get(url)
        # Scroll to the bottom four times, pausing a second each time and
        # checking that at least one product tile is present.
        for _ in range(4):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            tile_present = EC.presence_of_element_located((By.CSS_SELECTOR, '.gl-item'))
            wait.until(tile_present)
        get_products(browser.page_source)
    except TimeoutException as err:
        print(err)

def get_products(html):
    """
    Parse every product tile on a result page and insert one row per product.

    :param html: page source of a search-result page
    :return: None; rows are written through the module-level ``cursor`` and
        ``conn``, and the module-level ``INDEX`` counter is advanced once per
        stored product
    """
    global INDEX
    doc = pg(html)
    for item in doc('.gl-item').items():
        thtml = item.find('.p-name').html()
        if not thtml:
            # Tile without a product name: there is nothing to store, and it
            # should not consume a rank number.
            continue
        INDEX = INDEX + 1

        label, title = filter_title(thtml)
        if item.find('.p-tag3').attr('src'):
            label = '京东精选'
        shop = item.find('.p-shop').text().strip('\n')
        advertising = item.find('.promo-words').text()
        pinggou = item.find('.price-pingou').text().replace('\n', ' ').strip('￥')
        # Remove the phrases themselves; str.strip() would instead strip any of
        # their individual characters from both ends.
        comment = (
            item.find('.p-commit').text()
            .replace('\n条评价', '')
            .replace('二手有售', '')
            .strip('\n')
        )
        tag = item.find('.p-icons').text().replace('\n', '-')

        # The price lives in the <i> inside .p-price; tolerate a missing
        # element or an <i> without a single text child.
        price_html = item.find('.p-price').html()
        price_tag = BeautifulSoup(price_html, 'lxml').i if price_html else None
        raw_price = price_tag.string if price_tag is not None else None
        price = str(raw_price) if raw_price else ''

        # Fall back to the list price when no member price is shown.
        plus = item.find('.price-plus-1').text().strip('￥') or price
        # A non-numeric price cannot go into the DECIMAL columns; store zeros.
        if not price.split('.')[0].isdigit():
            price = "0"
            plus = "0"

        insert_db = (KEYWORD, shop, label, title, advertising, price, pinggou, plus, comment, tag, INDEX, CREATE_TIME)
        print(insert_db)
        try:
            # A single row needs execute(), not executemany().
            cursor.execute(sql_insert, insert_db)
            conn.commit()
        except Exception as e:
            # Best effort: report the failed row and carry on with the next one.
            print(e)
        print('The {} is running,return value is  {}! '.format(INDEX, cursor.lastrowid))

def main():
    """
    Crawl result pages 1..MAX_PAGE, then release the browser and database handles.

    :return: None
    """
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
            # Pause between pages.
            time.sleep(2)
    finally:
        # Clean up even if a page raised. quit() ends the whole WebDriver
        # session, whereas close() only closes the current window.
        try:
            browser.quit()
        finally:
            cursor.close()
            conn.close()

# Start the crawl only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()