Python爬虫(一)——爬取糗事百科

#! /usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
 
################version1####################
#url = 'http://www.qiushibaike.com/textnew/'
#user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
#headers = { 'User-Agent' : user_agent }
#try:
#    request = urllib2.Request(url,headers = headers)
#    response = urllib2.urlopen(request)
#    print response.read()
#except urllib2.URLError, e:
#    if hasattr(e,"code"):
#        print e.code
#    if hasattr(e,"reason"):
#        print e.reason


################version2####################
# Fetch the text pages one by one and use a regular expression to pull the
# wanted fields out of the returned HTML, printing each match to stdout.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
page_num = 1  # number of pages to fetch, starting from page 1

# Compiled once, outside the page loop, because it does not depend on the page.
# Capture groups, in order:
#   1. the text inside the <h2> that follows an "author" marker
#   2. the text after `content">` up to the next `<!--digits-->` comment
#   3. the digits inside the <i> tag that follows a "stats-vote" <span>
#   4. the digits inside the next <i>...</i> after that span is closed
# re.S lets `.` match newlines, so one match can span several lines of HTML.
pattern = re.compile(
    r'<div.*?author.*?<h2>(.*?)</h2>'
    r'.*?<div.*?content">(.*?)<!--[0-9]*-->'
    r'.*?<span.*?stats-vote.*?><i.*?>([0-9]*?)<.*?</span>'
    r'.*?<i.*?>([0-9]*)</i>',
    re.S)

for page in range(1, page_num + 1):
    url = 'http://www.qiushibaike.com/text/page/' + str(page)
    request = urllib2.Request(url, headers=headers)

    # Only the network calls sit inside the try, so the URLError handler
    # cannot hide unrelated failures from the parsing/printing code below.
    try:
        response = urllib2.urlopen(request)
        try:
            content = response.read().decode('utf-8')
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except urllib2.URLError as e:
        # HTTPError (a URLError subclass) carries an HTTP status in `code`;
        # a plain URLError only has `reason`. Report whichever is present.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Nothing was downloaded for this page; move on to the next one.
        continue

    items = pattern.findall(content)
    for item in items:
        print('*' * 90)
        # One captured field per line; <br/> tags in the text become newlines.
        print('\n'.join(item).replace('<br/>', '\n'))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值