import requests
import pymysql
import re
from fake_useragent import UserAgent
from lxml import etree
ua = UserAgent()  # random User-Agent generator used for every request header
values = []  # accumulated (title, price, link) tuples filled by spider(), consumed by insertData()
def spider(db, cursor, key, index):
    """Scrape one search-result page from dangdang.com.

    Appends (title, price, link) tuples to the module-level ``values`` list.

    :param db: open pymysql connection (unused here; kept for interface parity)
    :param cursor: database cursor (unused here; kept for interface parity)
    :param key: search keyword, e.g. "手机"
    :param index: 1-based result page number
    """
    headers = {'User-Agent': ua.random}  # rotate UA to reduce blocking
    url = ("http://search.dangdang.com/?key={key}&act=input&page_index={index}"
           .format(key=key, index=index))
    response = requests.get(url, headers=headers)
    content = etree.HTML(response.text)
    li_list = content.xpath('//div[@id="search_nature_rg"]/ul[@id="component_59"]/li')
    for li in li_list:
        titles = li.xpath('./a/@title')  # 商品名
        links = li.xpath('./a/@href')    # 链接
        if not titles or not links:
            # skip malformed entries instead of raising IndexError on [0]
            continue
        # 价格 — join the (possibly empty) text nodes and strip the ¥ sign
        price = ''.join(
            li.xpath('.//p[@class="price"]/span[@class="price_n"]/text()')
        ).replace("¥", "")
        values.append((titles[0], price, links[0]))
def table_exists(cursor, table_name):
    """判断数据库中表是否存在, 不存在则新建.

    Checks SHOW TABLES for *table_name*; if absent, drops any stale table of
    that name and creates a fresh one.

    :param cursor: database cursor
    :param table_name: name of the table to ensure (trusted constant —
        interpolated into DDL, which cannot be parameterized)
    """
    cursor.execute("show tables;")
    # Each SHOW TABLES row is a 1-tuple; compare the first column directly
    # instead of regex-parsing str(fetchall()) as the original did.
    existing = {row[0] for row in cursor.fetchall()}
    if table_name in existing:
        print('表已存在不需要创建')
    else:
        sql = """CREATE TABLE IF NOT EXISTS {}
        (
        title VARCHAR(2000),
        price VARCHAR(10),
        link VARCHAR(2000)
        )""".format(table_name)
        cursor.execute("drop table if exists {}".format(table_name))
        cursor.execute(sql)  # 创建表
        print("创建成功")
def insertData(db, cursor):
    """Crawl 29 search-result pages, then insert every collected row.

    Relies on the module-level globals ``values`` (filled by spider) and
    ``table_name`` (set in the ``__main__`` block).

    :param db: open pymysql connection (committed/rolled back per row)
    :param cursor: database cursor
    """
    for page in range(1, 30):
        spider(db, cursor, "手机", page)  # 搜索需要爬取的物品
    # Parameterized VALUES to avoid SQL injection / broken quoting (titles
    # often contain quotes).  The table name cannot be a bind parameter, so
    # it is interpolated once from the trusted module-level constant.
    sql = "INSERT INTO {} (title, price, link) VALUES (%s, %s, %s)".format(table_name)
    for title, price, link in values:
        try:
            cursor.execute(sql, (title, price, link))  # 执行sql语句
            db.commit()  # 提交到数据库执行
            print("成功写入数据" + title)
        except Exception:  # narrowed from bare except; still best-effort
            db.rollback()  # 发生错误时回滚
            print("写入失败" + sql)
if __name__ == '__main__':  # was `if name == 'main'` — never ran as written
    db = pymysql.connect(host='localhost', port=3306, user='root',
                         passwd='123456', db='students', charset='utf8')
    cursor = db.cursor()
    table_name = 'dangdangwang'  # 数据库表名 (read as a global by insertData)
    try:
        table_exists(cursor, table_name)
        insertData(db, cursor)
    finally:
        # close the cursor/connection even if crawling or inserting fails
        cursor.close()
        db.close()