# ---- sqlite_utils.py ----
import sqlite3
# Initialize the database
def init_connection():
    """Create the ``product`` table in ``jd.db`` if it does not exist yet.

    Opens its own connection to the SQLite file ``jd.db`` (relative to the
    current working directory), issues the schema statement and closes the
    connection again. Calling it repeatedly is safe.
    """
    connect = sqlite3.connect('jd.db')
    try:
        # IF NOT EXISTS keeps a second run from failing with
        # "table product already exists".
        connect.execute('''CREATE TABLE IF NOT EXISTS product(
            id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
            p_name TEXT NOT NULL,
            p_price INT NOT NULL,
            p_shop CHAR(200)
            );''')
        connect.commit()
    finally:
        # Release the file handle even if the statement fails.
        connect.close()
def get_connection():
    """Open and return a new connection to the ``jd.db`` SQLite file.

    The connection is returned open; closing it is left to the caller.
    """
    return sqlite3.connect('jd.db')
# Insert a single row
def insert_product(connect, p_name, p_price, p_shop):
    """Insert one row into the ``product`` table and commit it.

    Args:
        connect: An open ``sqlite3`` connection whose database contains the
            ``product`` table.
        p_name: Product name.
        p_price: Product price, as a number or a numeric string.
        p_shop: Name of the shop selling the product.
    """
    # Bound parameters instead of string concatenation: values containing
    # quotes (or hostile SQL) are stored verbatim rather than altering the
    # statement, and non-string prices no longer raise TypeError.
    sql = "INSERT INTO product (p_name, p_price, p_shop) VALUES (?, ?, ?)"
    connect.execute(sql, (p_name, p_price, p_shop))
    connect.commit()
if __name__ == '__main__':
    # Set up the schema, then open a connection and close it straight away.
    init_connection()
    # Sample statements for a manual insert, kept disabled as in the original:
    #   connection.execute("INSERT INTO product (p_name, p_price, p_shop) VALUES ('hello',1,'world')")
    #   connection.commit()
    get_connection().close()
# ---- spider.py ----
import requests
from bs4 import BeautifulSoup
import sqlite_utils
def _extract_product(item):
    """Return ``(name, price, shop)`` text for one product card, or ``None``.

    ``None`` is returned as soon as any expected element is missing, so the
    caller can skip incomplete cards.
    """
    # (outer tag, class attribute value, inner tag holding the text),
    # checked in the order name -> price -> shop.
    lookups = (
        ("div", "p-name p-name-type-2", "em"),
        ("div", "p-price", "i"),
        ("span", "J_im_icon", "a"),
    )
    fields = []
    for outer_tag, css_class, inner_tag in lookups:
        # attrs has to be a dict; the original passed a set literal
        # ({"class", "..."}) which only matched by accident.
        outer = item.find(name=outer_tag, attrs={"class": css_class})
        inner = outer.find(name=inner_tag) if outer is not None else None
        if inner is None:
            return None
        fields.append(inner.text)
    return tuple(fields)


def start():
    """Scrape JD search result pages and store each product in SQLite.

    Requests the odd-numbered result pages 1, 3, ..., 197 of the search URL,
    extracts name, price and shop from every ``div.gl-i-wrap`` card and
    writes them through ``sqlite_utils.insert_product``. The database
    connection is closed when the function exits, including on error.
    """
    # SQLite connection used to store the scraped rows.
    connection = sqlite_utils.get_connection()
    try:
        base_url = "https://search.jd.com/Search?keyword=笔记本散热"
        # Same 99 page numbers the original counter produced (1, 3, ..., 197).
        for page_num in range(1, 198, 2):
            print("总页数:100 当前爬取页:" + str(page_num))
            url = base_url + "&page=" + str(page_num)
            # Redirects are not followed so that a redirect response can be
            # used as the "past the last page" signal; the timeout keeps a
            # stalled connection from hanging the crawl forever.
            result = requests.get(url, allow_redirects=False, timeout=10)
            if result.is_redirect:
                break
            # Name the parser explicitly so behaviour does not depend on
            # which optional parsers are installed.
            page = BeautifulSoup(result.text, "html.parser")
            for card in page.find_all(name="div", attrs={"class": "gl-i-wrap"}):
                product = _extract_product(card)
                if product is None:
                    continue
                name_text, price_text, shop_text = product
                sqlite_utils.insert_product(connection, name_text, price_text, shop_text)
    finally:
        # Close the database connection even if a request or insert fails.
        connection.close()
if __name__ == '__main__':
    import sys

    try:
        start()
    except OSError as err:
        # Still end the run without a traceback, but report the failure
        # instead of discarding it silently.
        sys.stderr.write("spider aborted: {}\n".format(err))