#! /usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
################version1####################
#url = 'http://www.qiushibaike.com/textnew/'
#user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
#headers = { 'User-Agent' : user_agent }
#try:
# request = urllib2.Request(url,headers = headers)
# response = urllib2.urlopen(request)
# print response.read()
#except urllib2.URLError, e:
# if hasattr(e,"code"):
# print e.code
# if hasattr(e,"reason"):
# print e.reason
################version2####################
# Fetch the listing pages and pull the wanted fields out of the HTML with a
# regular expression.
#url = 'http://www.qiushibaike.com/textnew/'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
page_num = 1

# Compiled once at module level instead of once per downloaded page.
# Capture groups, in order:
#   1. text inside the <h2> that follows an "author" <div>
#   2. text after 'content">' up to the '<!--digits-->' marker
#   3. digits inside the <i> of the "stats-vote" <span>
#   4. digits inside the next <i> element (presumably the comment count --
#      TODO confirm against the live markup)
# re.S lets '.' span newlines, since one entry covers many lines of HTML.
pattern = re.compile(
    '<div.*?author.*?<h2>(.*?)</h2>.*?<div.*?content">(.*?)<!--[0-9]*-->'
    '.*?<span.*?stats-vote.*?><i.*?>([0-9]*?)<.*?</span>.*?<i.*?>([0-9]*)</i>',
    re.S)


def fetch_page(page):
    """Download listing page number *page* and return its body as text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Raises:
        urllib2.URLError: if the request fails (``urllib2.HTTPError`` is a
            subclass and is raised for HTTP error statuses).
    """
    url = 'http://www.qiushibaike.com/text/page/' + str(page)
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even when read() or decode() fails.
        response.close()


def extract_items(content):
    """Return every ``pattern`` match in *content* as a list of 4-tuples."""
    return pattern.findall(content)


def format_item(item):
    """Join the fields of one match with newlines, turning <br/> into newlines."""
    return '\n'.join(item).replace('<br/>', '\n')


def main():
    """Fetch pages 1..page_num and print each extracted entry.

    A failed download prints the error's ``code`` and/or ``reason`` (whichever
    attributes the exception carries) and moves on to the next page.
    """
    for page in range(1, page_num + 1):
        # Only the download can raise URLError, so only it sits in the try.
        try:
            content = fetch_page(page)
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue
        for item in extract_items(content):
            print('*' * 90)
            print(format_item(item))


if __name__ == '__main__':
    main()
# Python crawler (part 1): scraping Qiushibaike
# Most recently recommended article published on 2024-11-05 15:28:12