可以访问我的网站:http://www.cjluzzl.cn
爬虫功能:
1.爬取糗事百科段子并保存到本地文件
爬取到的文本展示
2.爬取糗事百科上的图片并保存到本地
先来看源代码,一睹为快
# -*- coding: utf-8 -*-
'''
Created on 2017年3月4日
@author: cjluzzl
'''
import re
import urllib2
import urllib
import os
def download(url):
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
header = {'User-Agent':user_agent}
request = urllib2.Request(url,headers=header)
print '正在下载',url
try:
html = urllib2.urlopen(request).read()
except urllib2.URLError as e:
print '打开网址 ',url,e.reason
return html
file = open(u"糗事百科.txt",'w+')
html = download('https://www.qiushibaike.com')
if html != None:
con = re.findall("<span>(.*?)</span>",html)
total_count = len(con)
print '一共有',total_count,'条'
file.write('一共有'+str(total_count)+'条\n')
for i in range(1,total_count):
print '\n这是第',i,'条'
file.write('\n这是第'+str(i)+'条\n')
if '<br/><br/>' in con[i]:
con[i]=con[i].replace('<br/><br/>','\n')
if '<br/>' in con[i]:
con[i]=con[i].replace('<br/>','\n')
print con[i]
if "<img " in con[i]:
print u'现在开始爬取图片'
#imageUrl = con[i][con[i].find('src=')+5:con[i].find('.jpg')+4]
imageUrl = re.findall('src="(.+?\.jpg)',con[i])
title = re.findall('alt="(.*?)"',con[i])
title = str(title[0])
print title
filename="D:/Download/" + title +".jpg"
#print filename
try:
urllib.urlretrieve(imageUrl[0],unicode(filename,'utf8'))
except IOError as a:
print 'error'
else:
file.write(con[i]+'\n')
else:
print '未获取到指定内容,请检查网址无误后重试'
file.close()
先看download函数,用于下载html源码
def download(url):
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
header = {'User-Agent':user_agent}
request = urllib2.Request(url,headers=header)
print '正在下载',url
try:
html = urllib2.urlopen(request).read()
except urllib2.URLError as e:
print '打开网址 ',url,e.reason
return html
读取段子和图片并写入文件
注意:
1.用Firebug查看html,逐层展开标签,查看内容,注意分析源码编写正则表达式
2.re.findall()方法返回的是列表
3.unicode(filename,'utf8')防止Windows下文件名乱码
file = open(u"糗事百科.txt",'w+')#创建本地文件
html = download('https://www.qiushibaike.com') #执行download函数下载HTML
if html != None:
con = re.findall("<span>(.*?)</span>",html)#获取段子内容
total_count = len(con)#计算段子总数
print '一共有',total_count,'条'
file.write('一共有'+str(total_count)+'条\n')
for i in range(1,total_count):
print '\n这是第',i,'条'
file.write('\n这是第'+str(i)+'条\n')
if '<br/><br/>' in con[i]:#<br/>标签转换行
con[i]=con[i].replace('<br/><br/>','\n')
if '<br/>' in con[i]:#<br/>标签转换行
con[i]=con[i].replace('<br/>','\n')
print con[i]
if "<img " in con[i]:#判断是否为图片段子
print u'现在开始爬取图片'
#imageUrl = con[i][con[i].find('src=')+5:con[i].find('.jpg')+4]
imageUrl = re.findall('src="(.+?\.jpg)',con[i])
title = re.findall('alt="(.*?)"',con[i])
title = str(title[0])
print title
filename="D:/Download/" + title +".jpg"
#print filename
try:
urllib.urlretrieve(imageUrl[0],fielname=unicode(filename,'utf8'))
except IOError as a:
print 'error'
else:#文字段子写入文件
file.write(con[i]+'\n')
else:
print '未获取到指定内容,请检查网址无误后重试'
file.close()