# Parse the page with XPath and scrape jokes (段子) from neihanshequ.com


import urllib.request
import urllib.parse
from lxml import etree
import os
import ssl
import json

# Disable SSL certificate verification for every HTTPS request made via
# urllib in this process.
# NOTE(review): this is insecure (permits man-in-the-middle attacks);
# prefer passing an unverified context to the single urlopen call instead.
ssl._create_default_https_context = ssl._create_unverified_context


# Target page to scrape.
url = 'http://neihanshequ.com/'
# Spoof a desktop-Chrome User-Agent so the site serves the regular HTML page.
headers ={
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}


# Accumulates one dict per scraped entry; filled by main().
jock_list = []

def main():
    """Scrape the neihanshequ.com front page and save the jokes as JSON.

    Fetches the page, extracts avatar/user/content/vote/share/comment
    fields for each entry with XPath, appends one dict per entry to the
    module-level ``jock_list``, and dumps the whole list to ``jock.text``.

    Side effects: network request, prints each entry, mutates
    ``jock_list``, writes ``jock.text`` in the working directory.
    """
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the HTTP response even on error; the original
    # code leaked the connection.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')

    html_tree = etree.HTML(html)
    li_list = html_tree.xpath('//div[@class="content"]/ul[@id="detail-list"]/li/div')

    # Field name -> relative XPath.  Each xpath() call returns a (possibly
    # empty) list; the list is stored as-is to keep the original output
    # shape.  Dict insertion order preserves the original key order.
    # NOTE(review): 'fovor' is likely a typo for 'favor', kept for
    # backward compatibility with consumers of the JSON file.
    field_xpaths = {
        'image_url': './/div[@class="header "]/a/img/@data-src',
        'user_name': './/div[@class="header "]/a/div/span[@class="name"]/text()',
        'content': './/div[@class="content-wrapper"]/a/div/p/text()',
        'support': './/div[@class="options"]/ul/li[@class="digg-wrapper "]/span[@class="digg"]/text()',
        'tread': './/div[@class="options"]/ul/li[@class="bury-wrapper "]/span[@class="bury"]/text()',
        'fovor': './/div[@class="options"]/ul/li[@class="repin-wrapper "]/span[@class="repin"]/text()',
        'share': './/div[@class="options"]/ul/li[@class="share-wrapper right"]/span[@class="share"]/text()',
        'comment': './/div[@class="options"]/ul/li[@class="post-comment-btn right"]/span[@class="comment J-comment-count"]/text()',
    }

    for li in li_list:
        # Extract every field for this entry in one pass.
        jock = {name: li.xpath(path) for name, path in field_xpaths.items()}
        print(jock)
        jock_list.append(jock)

    # Save results.  Filename kept as 'jock.text' for backward
    # compatibility ('.json' would be more conventional).
    with open('jock.text', 'w', encoding='utf-8') as f1:
        f1.write(json.dumps(jock_list, ensure_ascii=False))

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# (removed: CSDN blog-page boilerplate — comment/red-packet/payment UI text —
# accidentally pasted after the code; it was not Python and broke the file.)