One Task, Three Ways: Scraping Baidu Tieba Page Source

Let's start with an ordinary dish: plain procedural scraping code.

import random
import urllib.parse
import urllib.request

# Pool of User-Agent strings; one is chosen at random for this run
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                   'like Gecko) Chrome/81.0.4044.129 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/39.0.2171.71 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
                   'Chrome/23.0.1271.64 Safari/537.11'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, '
                   'like Gecko) Chrome/10.0.648.133 Safari/534.16'},
]
headers = random.choice(headers_list)
print(headers)
baseurl = 'https://tieba.baidu.com/f?'
name = input('Enter the Tieba forum name: ')
start = int(input('Enter the first page: '))
end = int(input('Enter the last page: '))
kw = urllib.parse.urlencode({'kw': name})
# Build the URL, send the request, save the response
for i in range(start, end + 1):
    # Build the URL; each Tieba page lists 50 threads
    pn = (i - 1) * 50
    url = baseurl + kw + '&pn=' + str(pn)

    # Send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')

    # Save the page to a file
    filename = 'page_' + str(i) + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print('Scraping page {1} of the {0} Tieba forum'.format(name, i))
        f.write(html)
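
One thing worth noting: headers is chosen once, before the loop, so every page in a run is fetched with the same User-Agent. A minimal sketch (the fetch name and delay values are my own) that re-rolls the User-Agent on every request and pauses politely between pages:

import random
import time
import urllib.request

def fetch(url, ua_pool):
    # Pick a fresh User-Agent dict from the pool for each request
    headers = random.choice(ua_pool)
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=10) as res:
        return res.read().decode('utf-8')

# Inside the page loop, this would replace the Request/urlopen pair:
#     html = fetch(url, headers_list)
#     time.sleep(random.uniform(1, 3))  # pause between requests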

Next, wrap the same logic in functions.

import urllib.parse
import urllib.request


def readPage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    # Create the request object
    req = urllib.request.Request(url, headers=headers)
    # Get the response object
    response = urllib.request.urlopen(req)
    # Read the response body
    html = response.read().decode('utf-8')
    return html


# Write the page to a file
def writePage(filename, html, name, i):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    return 'Scraping page {1} of the {0} Tieba forum'.format(name, i)


# Main function
def main():
    name = input('Enter the Tieba forum name: ')
    start = int(input('Enter the first page to scrape: '))
    end = int(input('Enter the last page to scrape: '))
    kw = urllib.parse.urlencode({'kw': name})
    for i in range(start, end + 1):
        # Build the URL; each Tieba page lists 50 threads
        pn = (i - 1) * 50
        baseurl = 'https://tieba.baidu.com/f?'
        url = baseurl + kw + '&pn=' + str(pn)
        html_get = readPage(url)
        filename = name + '_tieba_page_' + str(i) + '.html'
        printing = writePage(filename, html_get, name, i)
        print(printing)


if __name__ == '__main__':
    main()
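
readPage has no error handling, so a single timeout or connection reset kills the whole run. A sketch of a retry wrapper (the read_page_with_retry name and backoff values are my own):

import time
import urllib.error
import urllib.request

def read_page_with_retry(url, headers, retries=3):
    # Try the request up to `retries` times before giving up
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=10) as response:
                return response.read().decode('utf-8')
        except (urllib.error.URLError, TimeoutError):
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # back off: 1s, 2s, ...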

Finally, the same scraper rewritten in an object-oriented style.

import urllib.parse
import urllib.request


class BaiduSpider:
    def __init__(self):
        # Values that stay constant across requests live in __init__
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        # Create the request object
        req = urllib.request.Request(url, headers=self.headers)
        # Get the response object
        response = urllib.request.urlopen(req)
        # Read the response body
        html = response.read().decode('utf-8')
        return html

    # Write the page to a file
    def writePage(self, filename, html, name, i):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
        return 'Scraping page {1} of the {0} Tieba forum'.format(name, i)

    # Main method
    def main(self):
        name = input('Enter the Tieba forum name: ')
        start = int(input('Enter the first page to scrape: '))
        end = int(input('Enter the last page to scrape: '))
        kw = urllib.parse.urlencode({'kw': name})
        for i in range(start, end + 1):
            # Build the URL; each Tieba page lists 50 threads
            pn = (i - 1) * 50
            url = self.baseurl + kw + '&pn=' + str(pn)
            html_get = self.readPage(url)
            filename = name + '_tieba_page_' + str(i) + '.html'
            printing = self.writePage(filename, html_get, name, i)
            print(printing)


if __name__ == '__main__':
    # To call the methods, instantiate the class first
    spider = BaiduSpider()
    spider.main()
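
Because the request logic is isolated in readPage, the class can also be driven without the interactive prompts, which is handy for a quick test ('python' here is just an arbitrary forum name):

spider = BaiduSpider()
kw = urllib.parse.urlencode({'kw': 'python'})
html = spider.readPage(spider.baseurl + kw + '&pn=0')
print(len(html))  # rough sanity check that a page came back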

Whenever you scrape a site, make sure you comply with its spider rules (robots.txt), and never do anything illegal.
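
The standard library even ships a parser for those rules; a minimal sketch using urllib.robotparser to check a URL before fetching it:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://tieba.baidu.com/robots.txt')
rp.read()
allowed = rp.can_fetch('Mozilla/5.0', 'https://tieba.baidu.com/f?kw=python&pn=0')
print(allowed)  # only fetch the page if this is True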
