可以访问我的网站:http://www.cjluzzl.cn
爬虫功能:
1.爬取糗事百科段子并保存到本地文件
爬取到的文本展示
2.爬取糗事百科上的图片并保存到本地
先来看源代码,一睹为快
# -*- coding: utf-8 -*-
'''
Created on 2017年3月4日
@author: cjluzzl
'''
import re
import urllib2
import urllib
import os
def download(url):
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
header = {'User-Agent':user_agent}
request = urllib2.Request(url,headers=header)
print '正在下载',url
try:
html = urllib2.urlopen(request).read()
except urllib2.URLError as e:
print '打开网址 ',url,e.reason
return html
file = open(u"糗事百科.txt",'w+')
html = download('https://www.qiushibaike.com')
if html != None:
con = re.findall("<span>(.*?)</span>",html)
total_count = len(con)
print '一共有',total_count,'条'
file.write('一共有'+str(total_count)+'条\n')
for i in range(1,total_count):
print '\n这是第',i,'条'
file.write('\n这是第'+str(i)+'条\n')
if '<br/><br/>' in con[i]:
con[i]=con[i].replace('<br/><br/>','\n')
if '<br/>' in con[i]:
con[i]=con[i].replace('<br/>','\n')
print con[i]
if "<img " in con[i]:
print u'现在开始爬取图片'
#imageUrl = con[i][con[i].find('src=')+5:con[i].find('.jpg')+4]
imageUrl = re.findall('src="(.+?\.jpg)',con[i])
title = re.findall('alt="(.*?)"',con[i])
title = str(title[0])
print title
filename="D:/Download/" + title +".jpg"
#print filename
try:
urllib.urlretrieve(imageUrl[0],unicode(filename,'utf8'))
except IOError as a:
print 'error'
else:
file.write(con[i]+'\n')
else:
print '未获取到指定内容,请检查网址无误后重试'
file.close()
先看download函数,用于下载html源码
def download(url):
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
header = {'User-Agent':user_agent}
request = urllib2.Request(url,headers=header)
print '正在下载',url
try:
html = urllib2.urlopen(request).read()
except urllib2.URLError as e:
print '打开网址 ',url,e.reason
return html
读取段子和图片并写入文件
注意:
1.用Firebug查看html,逐层展开标签,查看内容,注意分析源码编写正则表达式
2.re.findall()方法返回的是列表
3.unicode(filename,'utf8')防止Windows下文件名乱码
file = open(u"糗事百科.txt",'w+')#创建本地文件
html = download('https://www.qiushibaike.com') #执行download函数下载HTML
if html != None:
con = re.findall("<span>(.*?)</span>",html)#获取段子内容
total_count = len(con)#计算段子总数
print '一共有',total_count,'条'
file.write('一共有'+str(total_count)+'条\n')
for i in range(1,total_count):
print '\n这是第',i,'条'
file.write('\n这是第'+str(i)+'条\n')
if '<br/><br/>' in con[i]:#<br/>标签转换行
con[i]=con[i].replace('<br/><br/>','\n')
if '<br/>' in con[i]:#<br/>标签转换行
con[i]=con[i].replace('<br/>','\n')
print con[i]
if "<img " in con[i]:#判断是否为图片段子
print u'现在开始爬取图片'
#imageUrl = con[i][con[i].find('src=')+5:con[i].find('.jpg')+4]
imageUrl = re.findall('src="(.+?\.jpg)',con[i])
title = re.findall('alt="(.*?)"',con[i])
title = str(title[0])
print title
filename="D:/Download/" + title +".jpg"
#print filename
try:
urllib.urlretrieve(imageUrl[0],fielname=unicode(filename,'utf8'))
except IOError as a:
print 'error'
else:#文字段子写入文件
file.write(con[i]+'\n')
else:
print '未获取到指定内容,请检查网址无误后重试'
file.close()