使用BeautifulSoup模块获取糗事百科上的笑话

更多解释请打开链接查看,这里使用BeautifulSoup爬取。

#!/usr/bin/python
#coding: utf-8

from bs4 import BeautifulSoup
import re, sys, urllib, urllib2
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Interactive loop: ask for a page number, download that page of the
# qiushibaike.com "hot" list and print every joke found on it.

# Values that are the same for every page are built once, outside the loop.
base_url = "http://www.qiushibaike.com/hot/page/"
# The request is sent with a Firefox User-Agent string.
# NOTE(review): presumably the site rejects urllib2's default agent - confirm.
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

while True:
    try:
        x = int(raw_input(u"请输入一个数字(输入0结束), 荤段子只有35页:"))
    except EOFError:
        # Input has ended (e.g. stdin closed or piped); retrying would raise
        # the same error again and spin forever, so stop instead.
        break
    except ValueError as e:
        # The text typed was not an integer: report it and ask again.
        print(e)
        print(u"请输入数字")
        continue

    # Entering 0 ends the program.
    if x == 0:
        break
    url = base_url + str(x) + "/"

    try:
        request = urllib2.Request(url, headers = headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        html = html.decode("utf-8")

        soup = BeautifulSoup(html, "lxml")
        # Each joke is taken from a <div class="content"> element.
        items = soup.find_all("div", {"class" : "content"})
        sys.stdout.write("\n")
        sys.stdout.write(u"第%d页\n" % x)
        sys.stdout.write("\n")
        # A separate counter name keeps the page number in x intact.
        for num, item in enumerate(items, 1):
            sys.stdout.write(u"第%d条" % num)
            sys.stdout.write("\n")
            sys.stdout.write(item.get_text())
            sys.stdout.write("\n")

    except Exception as e:
        # Best effort: any failure while fetching, parsing or printing is
        # reported and the loop goes on to prompt for another page.
        print(e)
        print(u"出错了,无法链接糗事百科!")


使用类封装

#!/usr/bin/python
#coding: utf-8

import re, sys, urllib, urllib2
from bs4 import BeautifulSoup

class Qiushi_spider(object):
    """Fetch one page of the qiushibaike.com "hot" list and print its jokes.

    Attributes set by ``__init__``:
        x: the page number passed in by the caller.
        url: the address of that page, built from ``x``.
    """

    def __init__(self, x):
        """Store the page number ``x`` and build the page URL from it."""
        self.x = x
        self.url = "http://www.qiushibaike.com/hot/page/" + str(self.x) + "/"

    def find_out(self):
        """Download the page at ``self.url`` and write each joke to stdout.

        Any exception raised while fetching, parsing or printing is caught
        and printed together with a retry hint. Returns nothing.
        """
        try:
            # The request is sent with a Firefox User-Agent string.
            headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

            request = urllib2.Request(self.url, headers = headers)
            response = urllib2.urlopen(request)
            try:
                html = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()

            soup = BeautifulSoup(html, "lxml")
            # attrs must be a dict mapping attribute name to value. A set
            # {"class", "content"} would be treated as a list of CSS class
            # names and would also match elements with class="class".
            items = soup.find_all("div", {"class": "content"})

            sys.stdout.write("\n")
            sys.stdout.write(u"第%d页\n" % self.x)
            sys.stdout.write("\n")

            # Number the jokes starting from 1.
            for num, item in enumerate(items, 1):
                sys.stdout.write(u"第%d条" % num)
                sys.stdout.write("\n")
                sys.stdout.write(item.get_text())
                sys.stdout.write("\n")
        except Exception as e:
            # Best effort: report the failure and let the caller continue.
            print(e)
            print(u"无法连接到糗事百科,请重新输入")

if __name__ == "__main__":
    # Keep asking for page numbers until the user enters 0 or input ends.
    while True:
        try:
            x = int(raw_input(u"请输入一个数字(输入0结束), 荤段子只有35页:"))

            # Entering 0 ends the program.
            if x == 0:
                break

            spider = Qiushi_spider(x)
            spider.find_out()

        except EOFError:
            # Input has ended (e.g. stdin closed or piped); retrying would
            # raise the same error again and spin forever, so stop instead.
            break
        except Exception as e:
            # Anything else (typically non-numeric input) is reported and
            # the user is prompted again.
            print(e)
            print(u"输入出错了,请重新输入")


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值