# Scraping Tieba Posts with Python

The script below crawls the thread list of a Baidu Tieba forum, collects each thread's id, then saves the first-floor text and images of every thread under `./data/<n>/`.

```python
# -*- coding: utf-8 -*-
import json
import os
import time
import urllib.request as urllib2

import requests
from bs4 import BeautifulSoup  # used but never imported in the original
from lxml import etree


def get_tz_id(tb_name, page_num):
    """Collect thread ids from the first page_num list pages of a forum."""
    tz_id = []
    for page in range(1, page_num + 1):
        # Each list page holds 50 threads; pn is the offset of the first one
        url = "http://tieba.baidu.com/f?kw=%s&pn=%s" % (tb_name, page * 50 - 50)
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        ul_li = tree.xpath('//*[@id="thread_list"]/li')[1:]
        for li in ul_li:
            data_field = li.xpath('./@data-field')
            # Skip Baidu promoted entries, which lack a data-field attribute
            if data_field:
                id_ = json.loads(data_field[0])['id']
                tz_id.append(id_)
    return tz_id


def get_info(i, tz_id):
    """Save the first-floor text and images of one thread to ./data/<i>/."""
    path_dir = './data/' + str(i)
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    tz_url = 'http://tieba.baidu.com/p/%s' % tz_id
    html = urllib2.urlopen(tz_url).read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all(name='cc')[0].find_all(
        class_='d_post_content j_d_post_content clearfix')
    text_content = a[0].get_text().strip()
    img_content = a[0].find_all('img')
    with open(path_dir + '/content.txt', 'a') as f:
        f.write(str(text_content))
        f.write('\n')
    # Collect the image URLs, then download each one
    if len(img_content) == 0:
        print("This post has no images")
    else:
        image_url_list = [img['src'] for img in img_content]
        for x, img_url in enumerate(image_url_list):
            r = requests.get(img_url)
            with open(path_dir + '/' + str(x) + '.jpg', "wb") as f:
                f.write(r.content)
            print('Image %d downloaded' % x)


def main():
    id_list = get_tz_id('nct', 1)
    print(id_list)
    for i, each in enumerate(id_list):
        time.sleep(2)  # throttle requests between threads
        print(i, each)
        get_info(i, each)


if __name__ == '__main__':
    main()
```
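Tieba sometimes serves an error page or blocks requests that carry no browser-like `User-Agent` header. Below is a minimal sketch of a fetch helper that sets one, using only the standard-library `urllib.request`; the helper name `fetch` and the exact header value are illustrative assumptions, not part of the original post.

```python
import urllib.request


def fetch(url):
    """Fetch a URL with a browser-like User-Agent header."""
    # The header value is an illustrative assumption; any modern UA string works
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        return resp.read()
```

With a helper like this in place, the `urllib2.urlopen(url).read()` calls in `get_tz_id` and `get_info` could be swapped for `fetch(url)`, and the same header could be passed to `requests.get` for the image downloads.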