xpath爬取首页信息,并获取详情页标题与时间

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/han_yanlong/article/details/76919000
# -*- coding: utf-8 -*-
# url为 伯乐在线文章首页
import sys
import requests
from lxml import etree
import random
import codecs
# Python 2-only workaround: reload ``sys`` so that ``setdefaultencoding`` can
# be called, then make UTF-8 the interpreter-wide default codec.
# Presumably needed so the UTF-8 byte-string literals in this file (e.g. the
# "·" separator) can be mixed with the text lxml returns -- TODO confirm.
# NOTE(review): ``reload`` is not a builtin on Python 3, so these two lines
# raise NameError there.
reload(sys)
sys.setdefaultencoding("utf-8")
def download_page(url, timeout=10):
    """Download ``url`` and return the raw response body.

    A User-Agent header is picked at random from a small fixed list for
    every request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as bytes when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, connection error or
        malformed URL).
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    ]
    headers = {"User-Agent": random.choice(user_agent_list)}

    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Callers treat ``None`` as "download failed"; a network error must
        # not abort the whole crawl.
        return None

    if response.status_code == 200:
        return response.content
    return None

# 爬取首页,第二页,第三页.....
def parse_page(url):
    """Parse one article-listing page and crawl every article it links to.

    For each ``div`` with class ``post floated-thumb`` the thumbnail link
    and image source are extracted and passed to ``parse_detail``.

    Args:
        url: Address of the listing page.

    Returns:
        None. Nothing is done when the page cannot be downloaded; a message
        is printed when the downloaded content cannot be parsed.
    """
    content = download_page(url)
    if not content:
        # Download failed; a retry could be attempted here.
        return

    doc = etree.HTML(content)
    if doc is None:
        print("%s %s" % ("该页面解析失败", url))
        return

    for div in doc.xpath("//div[@class='post floated-thumb']"):
        links = div.xpath("div[@class='post-thumb']/a")
        if not links:
            # Entry without a thumbnail link: skip it instead of raising
            # IndexError and losing the remaining entries.
            continue
        link = links[0]

        hrefs = link.xpath("@href")
        if not hrefs:
            continue

        # The image is optional for crawling the detail page.
        img_srcs = link.xpath("img/@src")
        img_src = img_srcs[0] if img_srcs else None

        parse_detail(hrefs[0], img_src)

# 爬取详情页
def parse_detail(url, src):
    """Download an article page and print its title and publication date.

    Args:
        url: Address of the article detail page.
        src: Thumbnail image URL taken from the listing page. Currently
            unused; kept so existing callers keep working.

    Returns:
        None. The title and date are printed, followed by a separator line.
        A failure message is printed when the page cannot be parsed or the
        expected elements are missing; nothing is done when the download
        fails.
    """
    content = download_page(url)
    if not content:
        # Download failed; a retry could be attempted here.
        return

    doc = etree.HTML(content)
    if doc is None:
        print("%s %s" % ("该页面详情解析失败", url))
        return

    titles = doc.xpath("//h1/text()")
    dates = doc.xpath("//p[@class='entry-meta-hide-on-mobile']/text()[1]")
    if not titles or not dates:
        # Page layout differs from what the XPath expressions expect; report
        # it instead of raising IndexError.
        print("%s %s" % ("该页面详情解析失败", url))
        return

    title = titles[0]
    # The first text node holds the date followed by a "·" separator.
    date_time = dates[0].strip().replace("·", "").strip()
    print("%s %s" % (title, date_time))
    print('-------------------------')


# 保存数据到本地
def save_to_file():
    """Placeholder for saving scraped data locally; not implemented yet."""
    return None

def main():
    """Crawl the article-listing page of blog.jobbole.com."""
    # The URL must not end with whitespace: a trailing space would be sent
    # as part of the request path.
    parse_page("http://blog.jobbole.com/all-posts/")


if __name__ == '__main__':
    main()

展开阅读全文

没有更多推荐了,返回首页