Python爬虫（一）——爬取糗事百科

最新推荐文章于 2024-11-05 15:28:12 发布

hehe__eheh

最新推荐文章于 2024-11-05 15:28:12 发布

阅读量398

点赞数

分类专栏： Python爬虫 | 文章标签： python 爬虫

本文链接：https://blog.csdn.net/hehe__eheh/article/details/50674506

版权

Python爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

#! /usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
 
################version1####################
#url = 'http://www.qiushibaike.com/textnew/'
#user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
#headers = { 'User-Agent' : user_agent }
#try:
#    request = urllib2.Request(url,headers = headers)
#    response = urllib2.urlopen(request)
#    print response.read()
#except urllib2.URLError, e:
#    if hasattr(e,"code"):
#        print e.code
#    if hasattr(e,"reason"):
#        print e.reason


################version2####################
# Use a regular expression to extract the wanted fields from each page.
#
# The statements below use syntax that is valid on both Python 2.6+ and
# Python 3 (``except ... as``, single-argument ``print(...)``); the
# ``urllib2`` module imported at the top of this file is Python 2 only.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Number of listing pages to fetch; pages 1..page_num are requested in order.
page_num = 1

# The page number is appended to this prefix to form each request URL.
BASE_URL = 'http://www.qiushibaike.com/text/page/'

# Compiled once instead of once per page.  re.S lets ``.`` span newlines so
# a single match can cover one whole multi-line post.  Captured groups:
#   1. text inside the <h2> that follows "author"
#   2. text between ``content">`` and the numeric ``<!--...-->`` marker
#   3. digits inside the <i> of the "stats-vote" <span>
#   4. digits inside the next <i> after that span
#      (presumably the comment count -- confirm against the live markup)
ITEM_PATTERN = re.compile(
    r'<div.*?author.*?<h2>(.*?)</h2>.*?'
    r'<div.*?content">(.*?)<!--[0-9]*-->.*?'
    r'<span.*?stats-vote.*?><i.*?>([0-9]*?)<.*?</span>.*?'
    r'<i.*?>([0-9]*)</i>',
    re.S)


def fetch_page(page):
    """Download listing page number *page* and return its body as text.

    The body is decoded as UTF-8.  ``urllib2.URLError`` (including its
    ``HTTPError`` subclass) propagates to the caller.
    """
    request = urllib2.Request(BASE_URL + str(page), headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read().decode('utf-8')
    finally:
        # The original never closed the response; release the socket even
        # when read() or decode() fails.
        response.close()


def extract_items(content):
    """Return a list of 4-tuples, one per post matched in *content*.

    Each tuple holds the four groups captured by ``ITEM_PATTERN``.  An
    input with no matching post yields an empty list.
    """
    return ITEM_PATTERN.findall(content)


def format_item(item):
    """Join the fields of *item* one per line, turning ``<br/>`` into newlines."""
    return '\n'.join(item).replace('<br/>', '\n')


def report_error(error):
    """Print the ``code`` and/or ``reason`` attributes that *error* carries."""
    if hasattr(error, "code"):
        print(error.code)
    if hasattr(error, "reason"):
        print(error.reason)


def main():
    """Fetch every configured page and print each post after a separator."""
    for page in range(1, page_num + 1):
        # Only the download can raise URLError, so only it is guarded.  A
        # failed page is reported and the loop moves on to the next one.
        try:
            content = fetch_page(page)
        except urllib2.URLError as error:
            report_error(error)
            continue
        for item in extract_items(content):
            print('*' * 90)
            print(format_item(item))


if __name__ == '__main__':
    main()