Python 爬取京东商品数据

sqlite_utils.py

import sqlite3

# Initialise the database
def init_connection():
    """Create the ``product`` table in ``jd.db`` if it does not exist yet.

    The function is safe to call repeatedly: ``IF NOT EXISTS`` turns the
    second and later calls into no-ops instead of raising
    ``sqlite3.OperationalError`` ("table product already exists").

    The connection opened here is only used for the schema setup and is
    always closed before returning, even if the statement fails.
    """
    connect = sqlite3.connect('jd.db')
    try:
        connect.execute('''CREATE TABLE IF NOT EXISTS product(
           id INTEGER PRIMARY KEY AUTOINCREMENT     NOT NULL,
           p_name           TEXT    NOT NULL,
           p_price            INT     NOT NULL,
           p_shop        CHAR(200)
           );''')
        connect.commit()
    finally:
        connect.close()


def get_connection():
    """Open and return a new ``sqlite3`` connection to the ``jd.db`` file.

    The caller owns the returned connection and is responsible for
    closing it.
    """
    return sqlite3.connect('jd.db')


# Insert one row
def insert_product(connect, p_name, p_price, p_shop):
    """Insert a single product row and commit it.

    Args:
        connect: An open ``sqlite3`` connection whose database contains the
            ``product`` table.
        p_name: Product title.
        p_price: Product price; either a number or its textual form.
        p_shop: Name of the shop selling the product.

    The values are bound as parameters rather than spliced into the SQL
    text, so quotes or other SQL syntax inside scraped strings cannot break
    or alter the statement, and a non-string price no longer raises
    ``TypeError`` during string concatenation.
    """
    sql = "INSERT INTO product (p_name, p_price, p_shop) VALUES (?, ?, ?)"
    connect.execute(sql, (p_name, p_price, p_shop))
    connect.commit()


if __name__ == '__main__':
    # Script entry point: create the schema, then open a connection and
    # close it again straight away.
    # To try a manual insert, run this on the connection before closing it,
    # followed by a commit:
    #   INSERT INTO product (p_name, p_price, p_shop) VALUES ('hello',1,'world')
    init_connection()
    get_connection().close()

spider.py

import requests
from bs4 import BeautifulSoup
import sqlite_utils

def _first_text(node, tag_name):
    """Return the text of the first ``tag_name`` descendant of ``node``.

    Returns ``None`` when ``node`` itself is ``None`` or has no such
    descendant, so callers can skip incomplete result cards instead of
    hitting ``AttributeError`` on a missing element.
    """
    if node is None:
        return None
    child = node.find(name=tag_name)
    if child is None:
        return None
    return child.text


def _parse_product(item):
    """Extract ``(name, price, shop)`` from one search-result card.

    Returns ``None`` if any of the three pieces is missing from the card.
    """
    # attrs must be a mapping of attribute name -> value; the original code
    # passed a set literal here (comma instead of colon).
    name = _first_text(
        item.find(name="div", attrs={"class": "p-name p-name-type-2"}), "em")
    price = _first_text(
        item.find(name="div", attrs={"class": "p-price"}), "i")
    shop = _first_text(
        item.find(name="span", attrs={"class": "J_im_icon"}), "a")
    if name is None or price is None or shop is None:
        return None
    return name, price, shop


def start():
    """Scrape JD search results page by page and store each product.

    Requests the search URL with odd ``page`` values from 1 to 197, parses
    every ``div.gl-i-wrap`` card on each page and writes the product name,
    price and shop to SQLite through ``sqlite_utils.insert_product``.

    Scraping stops early when the site answers with a redirect, which the
    original comment describes as the signal for having passed the last
    page. The database connection is closed even if a request or insert
    raises.
    """
    # SQLite connection used to store the scraped rows
    connection = sqlite_utils.get_connection()
    base_url = "https://search.jd.com/Search?keyword=笔记本散热"

    try:
        # NOTE(review): only odd page numbers are requested, as in the
        # original loop — presumably the site numbers full result pages
        # with odd values; confirm.
        for page_num in range(1, 199, 2):
            print("总页数:100   当前爬取页:" + str(page_num))
            url = base_url + "&page=" + str(page_num)
            # Redirects are not followed so that a redirect response can be
            # detected; the timeout keeps a stalled server from hanging the
            # whole run.
            result = requests.get(url, allow_redirects=False, timeout=10)
            if result.is_redirect:
                break
            # Name the parser explicitly so behaviour does not depend on
            # which optional parsers are installed
            soup = BeautifulSoup(result.text, "html.parser")
            for item in soup.find_all(name="div", attrs={"class": "gl-i-wrap"}):
                product = _parse_product(item)
                if product is None:
                    continue
                sqlite_utils.insert_product(connection, *product)
    finally:
        # Close the database connection
        connection.close()


if __name__ == '__main__':
    import sys

    try:
        start()
    except OSError as exc:
        # The run stays best-effort (no traceback, no re-raise), but the
        # failure is reported instead of being discarded silently.
        print("spider aborted: " + str(exc), file=sys.stderr)
  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值