import requests
from lxml import etree
from aikanbao.MysqlHelper import *
import json
# Scraper for the Qire9 tips site (qire9.com, "奇热妙招网").
# Running total of articles inserted by get(); printed once all pages are done.
number = 0
def _collect_paragraphs(detail):
    """Serialize the article's <p> elements that precede the '猜你喜欢' marker.

    Args:
        detail: parsed lxml tree of an article detail page.

    Returns:
        A list of HTML strings, one per <p> directly under the article body
        section, stopping before the first paragraph whose leading text node
        is exactly '猜你喜欢'. Every paragraph is returned when the marker is
        absent.
    """
    paragraphs = []
    for node in detail.xpath("//section[@class='mip-box-body mipcms-detail-body']/p"):
        texts = node.xpath('text()')
        if texts and texts[0] == '猜你喜欢':
            break
        markup = etree.tostring(node, encoding='utf-8', method='html')
        paragraphs.append(markup.decode('utf-8'))
    return paragraphs


def get(page):
    """Scrape one listing page and store its new articles in the database.

    Fetches ``http://www.qire9.com/yinshi/index_<page>.html`` and, for every
    listed article whose title is not yet present in the ``article`` table,
    downloads the detail page and inserts one row. The module-level
    ``number`` counter is incremented for each inserted row.

    Args:
        page: listing page number used to build the URL.

    Returns:
        The updated value of the global ``number`` counter.

    Note:
        Calls ``exit()`` when an INSERT reports zero affected rows, after
        printing the detail URL and the content that failed to save.
    """
    global number
    print('第'+str(page)+'页')
    print('---------------------')
    # Send an explicit User-Agent header with every request.
    user_agent = {'User-Agent': '****'}
    # Without a timeout, requests can block forever on a stalled server.
    timeout = 30
    tiaoshu = 0
    url = 'http://www.qire9.com/yinshi/index_'+str(page)+'.html'
    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    db = pymysql.connect(host="*****", user="***", password="****", database="***")
    try:
        cursor = db.cursor()
        html = requests.get(url, headers=user_agent, timeout=timeout).content
        ele = etree.HTML(html)
        html_data = ele.xpath("//div[@class='mipui-widget-media-body mipui-category-list-001']/div[@class='mipui-category-list-item']")
        for item in html_data:
            title = item.xpath("div[@class='item-media']/a/@title")[0].strip()
            detail_url = item.xpath("div[@class='item-media']/a/@href")[0].strip()
            cover = item.xpath("div[@class='item-media']/a/mip-img/@src")[0].strip()
            # The site's placeholder image means "no cover".
            if cover == '/public/assets/default/images/no-images.jpg':
                cover = ''
            # An empty description yields no text node; store '' instead of crashing.
            remark_nodes = item.xpath("div[@class='item-content']/p[@class='description']/text()")
            remark = remark_nodes[0].strip() if remark_nodes else ''
            created_at = item.xpath("div[@class='item-content']/p/span/text()")[0].strip()
            created_at = created_at+' 00:00:00'
            source_url = url

            # Skip articles already stored. The title is bound as a parameter
            # so quotes in scraped text cannot break or alter the statement.
            cursor.execute("select id as count from article WHERE title = %s", (title,))
            if cursor.fetchone():
                print('--已经存过了')
                continue

            # Fetch the detail page and keep only the real article paragraphs.
            detail_response = requests.get(detail_url, headers=user_agent, timeout=timeout).text
            detail = etree.HTML(detail_response)
            content = json.dumps(_collect_paragraphs(detail), ensure_ascii=False)

            # Insert the row; the driver escapes every bound value.
            sql = ("INSERT INTO article(source_type,type, title, cover,remark,content,source_url,created_at) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)")
            result = cursor.execute(
                sql, (2, 3, title, cover, remark, content, source_url, created_at))
            db.commit()
            if result:
                number += 1
                print(title+'-------------保存成功')
                tiaoshu += 1
            else:
                print('-------------')
                print(detail_url)
                print(content)
                print('插入失败')
                exit()
    finally:
        # Release the connection even when a request, a query or exit() aborts the run.
        db.close()
    print('-------------')
    print('第'+str(page)+'页' + str(tiaoshu) + '条')
    return number
# Scrape listing pages 3, 2, 1 in that order, then report the grand total.
page = 3
while page:
    get(page)
    page = page - 1
print('总共', str(number), '条', sep='')