Python 爬取贴吧信息

# -*- coding: utf-8 -*-

import urllib.request as urllib2
import json
import os
from lxml import etree


def get_tz_id(tb_name, page_num):
    """Collect thread ids from the first ``page_num`` list pages of a forum.

    Args:
        tb_name: Forum name placed in the ``kw`` query parameter. Non-ASCII
            characters are percent-encoded; a name that is already
            percent-encoded is passed through unchanged.
        page_num: Number of list pages to fetch, starting from the first.
            Values below 1 fetch nothing and yield an empty list.

    Returns:
        A list with the ``id`` value read from each thread item's
        ``data-field`` JSON attribute, in page order.
    """
    # Local import keeps this function independent of the file header.
    from urllib.parse import quote

    # '%' is kept safe so a caller passing a pre-encoded name is not
    # double-encoded; everything else outside the unreserved set is escaped.
    kw = quote(str(tb_name), safe='%')

    tz_id = []
    for page in range(1, page_num + 1):
        # "pn" starts at 0 and advances by 50 for every list page.
        url = "http://tieba.baidu.com/f?kw=%s&pn=%s" % (kw, (page * 50 - 50))
        with urllib2.urlopen(url) as response:
            html = response.read()
        tree = etree.HTML(html)
        # The first <li> of the thread list is always skipped.
        # NOTE(review): presumably a non-thread entry - confirm against the page.
        ul_li = tree.xpath('//*[@id="thread_list"]/li')[1:]
        for li in ul_li:
            # Items without a data-field attribute are ignored; this filters
            # out the Baidu promoted entries.
            data_field = li.xpath('./@data-field')
            if data_field:
                tz_id.append(json.loads(data_field[0])['id'])

    return tz_id


def get_info(i, tz_id):
    """Download the first post of a thread: its text and its images.

    The post text is appended to ``./data/<i>/content.txt`` and every image
    found inside the post body is saved beside it as ``0.jpg``, ``1.jpg``, ...

    Args:
        i: Index used as the name of the output sub-directory under
            ``./data``.
        tz_id: Thread id, interpolated into
            ``http://tieba.baidu.com/p/<tz_id>``.

    Returns:
        None. If the page contains no post body, a message is printed and
        nothing is written.
    """
    # requests is only needed here, so it is imported locally.
    import requests

    path_dir = './data/' + str(i)
    os.makedirs(path_dir, exist_ok=True)

    tz_url = 'http://tieba.baidu.com/p/%s' % tz_id
    with urllib2.urlopen(tz_url) as response:
        raw = response.read()

    # lxml is the HTML parser this file imports; the bytes are handed over
    # with an explicit UTF-8 parser so no separate decode step is required.
    tree = etree.HTML(raw, parser=etree.HTMLParser(encoding='utf-8'))

    # Inside the first <cc> element, pick elements whose class list contains
    # the token "d_post_content" (token match, so extra whitespace or
    # additional classes in the attribute do not matter).
    content_nodes = []
    if tree is not None:
        content_nodes = tree.xpath(
            '(//cc)[1]//*[contains(concat(" ", normalize-space(@class), " "),'
            ' " d_post_content ")]'
        )
    if not content_nodes:
        # No post body on the page: skip this thread instead of raising.
        print("未找到帖子正文,已跳过")
        return

    post = content_nodes[0]
    text_content = ''.join(post.itertext()).strip()
    image_url_list = post.xpath('.//img/@src')

    # Explicit encoding so the text is written the same way on every platform.
    with open(path_dir + '/content.txt', 'a', encoding='utf-8') as text_file:
        text_file.write(text_content)
        text_file.write('\n')

    if not image_url_list:
        print("此条帖子不存在图片")
        return

    # Fetch each image URL and store it under a running index.
    for x, url in enumerate(image_url_list):
        r = requests.get(url)
        path = path_dir + '/' + str(x) + '.jpg'
        with open(path, "wb") as image_file:
            image_file.write(r.content)
        print('第' + str(x) + '个爬取完成')


def main(tb_name='nct', page_num=1, delay=2):
    """Crawl a forum: list its thread ids, then download every thread.

    Args:
        tb_name: Forum name passed to ``get_tz_id``. Defaults to ``'nct'``.
        page_num: Number of list pages to scan. Defaults to ``1``.
        delay: Seconds to sleep before each thread download. Defaults to ``2``.
    """
    # Imported once per call rather than on every loop iteration.
    import time

    id_list = get_tz_id(tb_name, page_num)
    print(id_list)
    for i, each in enumerate(id_list):
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(delay)
        print(i, each)
        get_info(i, each)

# Run the crawl only when executed as a script, so importing this module
# does not start network requests.
if __name__ == "__main__":
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值