Python XPath crawler: 妙招网 (tips site)

import json

import requests
import pymysql
from lxml import etree

# Qire tips site (奇热妙招网)
number = 0  # total articles saved across all pages
def get(page):
    global number
    print('Page ' + str(page))
    print('---------------------')
    # Set a User-Agent header
    user_agent = {'User-Agent': '****'}
    tiaoshu = 0  # articles saved on this page
    # Database connection
    db = pymysql.connect(host='*****', user='***', password='****', database='***')
    cursor = db.cursor()
    url = 'http://www.qire9.com/yinshi/index_' + str(page) + '.html'
    html = requests.get(url, headers=user_agent).content
    ele = etree.HTML(html)
    # Each article on the list page sits in a mipui-category-list-item block
    html_data = ele.xpath("//div[@class='mipui-widget-media-body mipui-category-list-001']/div[@class='mipui-category-list-item']")
    for i in html_data:
        title = i.xpath("div[@class='item-media']/a/@title")[0].strip()
        detail_url = i.xpath("div[@class='item-media']/a/@href")[0].strip()
        cover = i.xpath("div[@class='item-media']/a/mip-img/@src")[0].strip()
        # The site serves a placeholder image when an article has no cover
        if cover == '/public/assets/default/images/no-images.jpg':
            cover = ''
        remark = i.xpath("div[@class='item-content']/p[@class='description']/text()")[0].strip()
        created_at = i.xpath("div[@class='item-content']/p/span/text()")[0].strip()
        created_at = created_at + ' 00:00:00'  # the list page only shows a date
        source_url = url
        # Skip articles that have already been stored
        count_sql = "SELECT id FROM article WHERE title = %s"
        cursor.execute(count_sql, (title,))
        find = cursor.fetchone()
        if find:
            print('--already saved, skipping')
            continue
        # Detail page
        detail_response = requests.get(detail_url, headers=user_agent).text
        detail = etree.HTML(detail_response)
        p_list = detail.xpath("//section[@class='mip-box-body mipcms-detail-body']/p")
        # Count the <p> tags that belong to the article body: everything before
        # the '猜你喜欢' (recommended reading) marker paragraph is kept
        p_count = 0
        for c in p_list:
            if c.xpath('text()') and c.xpath('text()')[0] == '猜你喜欢':
                break
            p_count += 1
        # Serialize each body paragraph as an HTML fragment in a JSON array
        jsonList = []
        for table in p_list[:p_count]:
            detail_content = etree.tostring(table, encoding='utf-8', method='html')
            jsonList.append(detail_content.decode('utf-8'))
        content = json.dumps(jsonList, ensure_ascii=False)
        # Insert into the database (parameterized, so quotes in the scraped
        # text cannot break the statement)
        sql = ("INSERT INTO article (source_type, type, title, cover, remark, content, source_url, created_at) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        result = cursor.execute(sql, (2, 3, title, cover, remark, content, source_url, created_at))
        db.commit()
        if result:
            number += 1
            print(title + '-------------saved')
            tiaoshu += 1
        else:
            print('-------------')
            print(detail_url)
            print(content)
            print('insert failed')
            print('-------------')
            exit()
    print('Page ' + str(page) + ': ' + str(tiaoshu) + ' saved')
    db.close()
    return number
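The body-extraction step above is the subtle part: the loop counts <p> tags until it reaches the paragraph whose text is '猜你喜欢' (the recommended-reading block the site appends after every article), and only the paragraphs before that marker are serialized. A minimal offline sketch of the same slicing, using invented markup (the sample HTML here is made up for illustration, not taken from the site):

from lxml import etree

sample = """
<section class="mip-box-body mipcms-detail-body">
  <p>Step one of the tip.</p>
  <p>Step two of the tip.</p>
  <p>猜你喜欢</p>
  <p>Unrelated recommended article.</p>
</section>
"""

doc = etree.HTML(sample)
p_list = doc.xpath("//section[@class='mip-box-body mipcms-detail-body']/p")

# Count body paragraphs: stop at the '猜你喜欢' marker
p_count = 0
for p in p_list:
    if p.xpath('text()') and p.xpath('text()')[0] == '猜你喜欢':
        break
    p_count += 1

body = p_list[:p_count]
print(len(body))  # 2: only the two real paragraphs survive

If the marker never appears, p_count ends up equal to len(p_list) and the slice keeps every paragraph, which is the fallback the crawler relies on.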


# Crawl pages 3 down to 1
page = 3
while page > 0:
    get(page)
    page -= 1
print('Total: ' + str(number) + ' articles')
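One practical note: each page crawl makes one request for the list plus one per article, with no delay and no timeout, so a slow or unhappy server can stall the run indefinitely. A small fetch wrapper is an easy improvement (a sketch; the one-second delay and ten-second timeout are arbitrary choices, not values from the script above):

import time
import requests

def fetch(url, headers, delay=1.0, timeout=10):
    # Pause between requests to be polite, and bound the wait on slow responses
    time.sleep(delay)
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    return resp

Swapping the two requests.get(...) calls inside get() for fetch(url, user_agent) would make repeated runs gentler on the site and surface HTTP errors early.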

 
