Scraping Baidu Tieba

#coding:utf-8

import requests

# lxml is the package; etree is the parsing module inside it

from lxml import etree
#from PIL import Image

class TiebaSpider(object):
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.base_url = "http://tieba.baidu.com"

        # Python 2 style input; on Python 3 use input() instead
        self.tieba_name = raw_input("Enter the name of the Tieba to scrape: ")
        self.begin_page = int(raw_input("Enter the first page to scrape: "))
        self.end_page = int(raw_input("Enter the last page to scrape: "))


    def send_request(self, url, query_dict={}):
        # Fetch a URL; query_dict is encoded into the query string by requests
        response = requests.get(url, params=query_dict, headers=self.headers)
        return response


    def parse_page(self, response):
        # Extract the relative link of every thread on a listing page
        html = response.content
        print(type(html))   # debug: confirm the response body type
        #print(html)
        html_obj = etree.HTML(html)
        page_link_list = html_obj.xpath("//a[@class='j_th_tit ']/@href")
        return page_link_list


    def parse_image(self, response):
        # Extract the source URL of every image embedded in a thread page
        html = response.content

        html_obj = etree.HTML(html)
        image_link_list = html_obj.xpath("//img[@class='BDE_Image']/@src")
        return image_link_list


    def save_image(self, response, filename):
        # Write the raw image bytes to disk
        print("[INFO]: saving image {}".format(filename))
        with open(filename, "wb") as f:
            f.write(response.content)


    def main(self):
        # Stage 1: fetch each listing page and extract the link of every thread on it
        for page in range(self.begin_page, self.end_page + 1):
            pn = (page - 1) * 50   # Tieba shows 50 threads per listing page

            query_dict = {"kw": self.tieba_name, "pn": pn}
            full_url = self.base_url + "/f?"

            response = self.send_request(full_url, query_dict)
            page_link_list = self.parse_page(response)

            # Stage 2: fetch each thread page and extract its image links
            for page_link in page_link_list:
                page_full_url = self.base_url + page_link
                page_response = self.send_request(page_full_url)

                image_link_list = self.parse_image(page_response)

                # Stage 3: download each image and save it,
                # using the last 15 characters of the URL as the filename
                for image_link in image_link_list:
                    image_response = self.send_request(image_link)

                    self.save_image(image_response, image_link[-15:])

if __name__ == '__main__':
    spider = TiebaSpider()
    spider.main()
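
For reference, here is a minimal sketch of the listing-page URL that send_request builds through requests' params argument. The Tieba name "python" is only an example, and the request is prepared but never actually sent:

import requests

# Sketch only: show how the query_dict from main() becomes the listing-page URL
# ("python" is a hypothetical Tieba name; nothing is sent over the network).
req = requests.Request("GET", "http://tieba.baidu.com/f?",
                       params={"kw": "python", "pn": 0}).prepare()
print(req.url)   # http://tieba.baidu.com/f?kw=python&pn=0

Page 2 would use pn=50, page 3 pn=100, and so on, matching pn = (page - 1) * 50 in main().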

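The two XPath expressions do the core extraction work. Below is a self-contained sketch of how etree.HTML and xpath() behave; the HTML fragment is made up for illustration and only mimics the structure the spider relies on. Note that the class value 'j_th_tit ' keeps a trailing space on purpose, matching the attribute exactly as it appears in the listing-page markup:

from lxml import etree

# Hypothetical fragment mimicking a Tieba listing entry and an in-thread image.
sample = """
<div>
  <a class="j_th_tit " href="/p/123456">example thread title</a>
  <img class="BDE_Image" src="http://example.com/pic/abcdef123456.jpg"/>
</div>
"""

doc = etree.HTML(sample)
print(doc.xpath("//a[@class='j_th_tit ']/@href"))    # ['/p/123456'] -- relative, hence base_url + page_link
print(doc.xpath("//img[@class='BDE_Image']/@src"))   # ['http://example.com/pic/abcdef123456.jpg']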