Python爬虫初探——天涯

最新推荐文章于 2021-03-16 11:05:40 发布

闫小玉

最新推荐文章于 2021-03-16 11:05:40 发布

阅读量1.1k

点赞数 1

分类专栏： Python 文章标签： Python爬虫

本文链接：https://blog.csdn.net/weixin_43240274/article/details/91378146

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

天涯论坛爬取全部博文，保存为word文档

一切工作从目录开始

[注：全部过程先看了一遍 https://www.jianshu.com/p/81a5da4fa161 简书，理清思路]

首先我们要从博主的主页面入手，将显示的博文一篇一篇爬取下来。主页面如下
（网页url： http://www.tianya.cn/6090416/bbs ）
在这里插入图片描述
爬取目录文章的URL是我们的目标。重点在于我们如何从前一篇文章的url得到后一篇文章的URL，这样就可以完全实现全自动的爬取了。然后把爬取的文章url放在txt文档之后调用即可。
首先我遵从“人类进化起源于懒惰”这一至理名言，希望利用八爪鱼可视化爬虫软件完成第一步，遂卒……因为从调试中可以看到，目录信息是无法直接读取的，也就是简书那篇文章所说的“接口难寻”：这部分内容是异步加载的，需要我们去找到目录页的真实URL。按F12打开Network面板，找到bbsuser请求的Headers，其中的Request URL为：http://www.tianya.cn/api/bbsuser?method=userinfo.ice.getUserTotalArticleList&params.userId=6090416&params.pageSize=20&params.bMore=true&params.kindId=-1&params.publicNextId=2147483647&params.techNextId=2147483647&params.cityNextId=2147483647
这才是我们所需要的URL。
在这里插入图片描述

分析前后两次目录请求的URL可以发现它们是有联系的，即前一次返回内容中的"public_next_id"字段，就是后一次请求URL中params.publicNextId、params.techNextId、params.cityNextId的取值。于是可以用正则表达式提取并拼接，从而得到全部文章的URL。代码如下:

# -*- coding: utf-8 -*-
"""
Created on Mon Jun  3 12:28:32 2019

@author: Administrator
"""
from pyspider.libs.base_handler import *
import urllib.request
import re

# Output file: one article URL per line.
filepath = 'D:\\Ayu\\website.txt'

# Article-list API for user 6090416. The same cursor value is sent as
# publicNextId, techNextId and cityNextId; 2147483647 requests the first page.
list_url_template = (
    'http://www.tianya.cn/api/bbsuser?method=userinfo.ice.getUserTotalArticleList'
    '&params.userId=6090416&params.pageSize=20&params.bMore=true&params.kindId=-1'
    '&params.publicNextId={0}&params.techNextId={0}&params.cityNextId={0}'
)

# Patterns are compiled once instead of once per page.
get_id = re.compile(r'"art_id":"(.*?)","item"')
get_type = re.compile(r'"item":"(.*?)","extend"')
get_nextid = re.compile(r'"public_next_id":"(.*?)","tech_next_id"')


def fetch_text(url):
    """Download url and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


# "with" guarantees the URL list is flushed and closed, also when a request fails.
with open(filepath, 'w') as filehandle:
    html = fetch_text(list_url_template.format('2147483647'))
    i = 1
    while True:
        art_id = get_id.findall(html)
        item_type = get_type.findall(html)
        if not art_id:
            # A page without articles ends the crawl.
            break

        # zip() pairs board name and article id without risking an IndexError
        # when the two patterns match a different number of times.
        for board, post_id in zip(item_type, art_id):
            str1 = 'http://bbs.tianya.cn/post-' + board + '-' + post_id + '-1.shtml'
            filehandle.write(str1 + '\n')
            print(str1)

        print('============================' + '第' + str(i) + ' 页' + '==============================')
        i += 1

        next_id = get_nextid.findall(html)
        if not next_id:
            # No cursor in the response: there is nothing more to request.
            break
        html = fetch_text(list_url_template.format(next_id[0]))

每个文章的URL得到了，终于要开始爬文章和图片

人类的本质是复读机————某至理名言

此部分框架用的是csdn上这位大哥的 https://blog.csdn.net/koanzhongxue/article/details/45709861 ，其次需要我们按照自己情况来修改的地方（不得不说虽然re正则表达式是最基础最灵活的爬虫工具，但能够灵活运用还是需要火候，比如我在爬取的时候就因为没有匹配好前后值，导致总是爬到某一篇文章就断掉，因为那篇文章的结构和其他的不一样。相比下来beautiful soup是真的好用，上手快且不需要处理太多异常情况）
其中我们使用了python-docx库（生成Word文档）、re正则表达式、urllib.request（Python3中访问URL的模块）、PIL的Image图片读取模块、BeautifulSoup，以及用来推迟线程运行的time.sleep函数

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import urllib.request
import re
import _locale
from bs4 import BeautifulSoup
import os
import time
import requests
from docx import Document
from docx.shared import Inches
from PIL import Image


# Force the default locale reported to Python to zh_CN / utf8.
# NOTE(review): presumably so default-encoding I/O is UTF-8 on Chinese Windows - confirm.
_locale._getdefaultlocale = (lambda *args: ['zh_CN', 'utf8'])

# All patterns are compiled once, outside the per-article loop.
gettitle = re.compile(r'<title>(.*?)</title>')
# Number on the page link that directly precedes the "下页" (next page) link.
getmaxlength = re.compile(r'<a href=".*?">(\d*)</a>\s*<a href=".*?"\s*class=.*?>下页</a>')
# Body of every reply; [\S\s] matches any character, newlines included.
gettext = re.compile(r'<div class="bbs-content">\s*([\S\s]*?)\s*</div>')
# Post header: (text before the author link, uname attribute, time).
getpagemsg = re.compile(r'<div class="atl-info">\s*<span>(.*?)<a href="http:.*uname="(.*)">.*\s*<span>时间：(.*?)</span>')
getnextpagelink = re.compile(r'<a href="(.*?)"\s*class.*?>下页</a>')
# Characters XML 1.0 cannot store; they make add_paragraph raise
# "ValueError: All strings must be XML compatible".
illegal_xml_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]')


def fetch_html(url):
    """Download url and return the response body decoded as UTF-8."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def add_text(doc, text):
    """Append text as a paragraph after dropping XML-incompatible characters."""
    doc.add_paragraph(illegal_xml_chars.sub('', text))


def add_image(doc, img_index):
    """Insert '<img_index>.jpg' from the working directory, then delete the file."""
    images = '%s.jpg' % img_index
    try:
        doc.add_picture(images, width=Inches(4))  # fixed width of 4 inches
        os.remove(images)  # the local copy is no longer needed
    except Exception:
        # Deliberately best-effort: a picture that was never downloaded or
        # cannot be read must not abort the whole article.
        pass


def add_reply(doc, raw_html, img_index):
    """Write one reply body to doc and return the updated image index.

    The HTML is split on whitespace; a token starting with '<img' consumes the
    next downloaded picture, every other token becomes its own paragraph.
    """
    cleaned = raw_html.replace('<br>', '\n').replace('title="点击图片查看幻灯模式"', '\n')
    for token in cleaned.split():
        if token.startswith('<img'):
            add_image(doc, img_index)
            img_index += 1
        else:
            add_text(doc, token + '\n')
    return img_index


with open("D:\\Ayu\\files\\readWebsite0.txt", "r", encoding='utf-8') as url_file:
    for line in url_file:
        # Each line ends with '\n'; a control character in the URL makes
        # urlopen fail on current Python versions, so strip it first.
        line = line.strip()
        if not line:
            continue

        html = fetch_html(line)

        # Article title, also used as the output file name.
        title = gettitle.findall(html)
        print(title)
        # Skip finished articles so a crash does not force a restart from the
        # first of the 900+ URLs.
        # NOTE: python-docx writes the .docx format; the '.doc' suffix is kept
        # so this check still recognises files written by earlier runs.
        if os.path.exists(title[0] + '.doc'):
            print("该文章已存在")
            continue

        # Highest page number of the thread (1 when there is no pager).
        page_match = getmaxlength.search(html)
        if page_match:
            maxlength = page_match.group(1)
            print(maxlength)
        else:
            maxlength = 1

        origin_soup = BeautifulSoup(html, 'lxml')
        # The opening post is read with BeautifulSoup; replies use the regex.
        content = origin_soup.find(class_="bbs-content clearfix")
        content_image = origin_soup.find_all('img')

        # NOTE(review): this directory is created but the pictures below are
        # written to the working directory, where add_image() looks for them.
        new_path = os.path.join(os.getcwd(), 'pictures')
        if not os.path.isdir(new_path):
            os.mkdir(new_path)

        # Download the pictures as 1.jpg, 2.jpg, ... in document order.
        image_couter = 1
        for img in content_image:
            img_url = img.get('original')
            if img_url is None:
                continue
            try:
                # The page URL is sent as the Referer header instead of being
                # appended to the image URL, which produced malformed requests
                # such as '/1.jpghttp://bbs.tianya.cn/...shtml%0A'.
                # NOTE(review): assumes the image host checks Referer for
                # hotlink protection - confirm against the live site.
                url_content = requests.get(img_url, headers={'Referer': line}, timeout=30)
            except requests.RequestException:
                print(u'HTTP请求失败！正在准备重发...')
                time.sleep(2)
                continue

            img_name = '%s.jpg' % image_couter
            with open(img_name, 'wb') as f:
                f.write(url_content.content)
            image_couter += 1
            print('下载图片完成')

        ############################ write the document ############################
        doc = Document()
        add_text(doc, title[0] + '\n')

        # Text of the opening post; whitespace left over from the HTML is removed.
        if content is None:
            first_post = ''
        else:
            first_post = content.text.replace('\t', '').replace('\r', '').replace('\n', '')

        # Defined before the loop so it exists even if page 1 has no headers.
        img_index = 1
        for pageno in range(1, int(maxlength) + 1):
            i = 0

            str1 = '============================' + '第 ' + str(pageno) + ' 页' + '=============================='
            print(str1)
            add_text(doc, str1 + '\n')

            # Header (author, time) and body of every post on this page.
            pagemsg = getpagemsg.findall(html)
            text = gettext.findall(html)
            for position, ones in enumerate(pagemsg):
                # After page 1 the first header repeats the thread header: skip it.
                if pageno > 1 and position == 0:
                    continue

                if 'host' in ones[0]:
                    add_text(doc, '楼主：' + ones[1] + '     时间:' + ones[2] + '\n')
                else:
                    add_text(doc, ones[0] + ones[1] + '    时间:' + ones[2] + '\n')

                if pageno == 1 and i == 0:
                    # Opening post: paragraphs are separated by two full-width
                    # spaces; an empty chunk marks the position of a picture.
                    for chunk in first_post.split('\u3000\u3000')[1:]:
                        if len(chunk) == 0:
                            add_image(doc, img_index)
                            img_index += 1
                        else:
                            add_text(doc, chunk)
                else:
                    # On page 1 the opening post is not part of `text`,
                    # so the replies are shifted by one.
                    reply_index = i - 1 if pageno == 1 else i
                    if reply_index < len(text):
                        img_index = add_reply(doc, text[reply_index], img_index)
                    else:
                        # Headers and bodies did not line up for this page.
                        print('error occured at >>pageno:' + str(pageno) + '   line:' + str(i))

                i = i + 1

            if pageno < int(maxlength):
                # Follow the "next page" link and load that page.
                next_match = getnextpagelink.search(html)
                if next_match is None:
                    break  # no link found: save what has been collected
                nextpagelink = 'http://bbs.tianya.cn' + next_match.group(1)
                html = fetch_html(nextpagelink)

        doc.save(title[0] + '.doc')

图片下载出了一些大问题

https://blog.csdn.net/weixin_40420401/article/details/82384049 python 3.x 爬虫基础—http headers详解（转）
https://blog.csdn.net/hu77700021/article/details/79837569 爬虫反爬-关于headers（UA、referer、cookies）的一些有趣反爬
https://blog.csdn.net/qq_33733970/article/details/77876761 爬虫之突破天涯防盗链
我要单独拿出来这段代码的想法就是，真的被这问题搞了好几天！若用src或者original直接得到的图片不是正在加载滚动，就是天涯社区的马赛克，URL显示的是403。直到看到上面最后一篇文章的解决方法。其实自己对前端的知识也不了解，参考网址仅记录在此日后学习查看。

        # Choose the output directory, then download the pictures.
        path = os.getcwd()
        new_path = os.path.join(path, 'pictures')
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        new_path += '/'
        #print(new_path)

        image_couter=1
        for img in content_image:
            # The real picture address is in the 'original' attribute.
            img_url=img.get('original')
            #print (img_url)
            if img_url is not None:
                # Send the page URL as the Referer header rather than appending
                # it to the image URL, which yields a malformed request path.
                # NOTE(review): assumes the image host checks Referer for
                # hotlink protection - confirm against the live site.
                url_content = requests.get(img_url, headers={'Referer': line.strip()})
                url_content.encoding = url_content.apparent_encoding
                img_name = '%s.jpg' % image_couter

                # The with-block closes the file; no explicit close() is needed.
                with open(os.path.join(img_name), 'wb') as f:
                    f.write(url_content.content)

                image_couter += 1

        print('下载图片完成')

调试阶段

利用单网页.py排查错误，意思就是每当大代码循环被终止的时候，把出错的网页URL放在下面代码的line中再运行并且运行成功后再去大代码跑循环。其实内容和主代码一致，只是这里line是固定的，主代码是循环读取的。

# -*- coding: utf-8 -*-
"""
Created on Thu Jun  6 16:52:50 2019

@author: sybil_yan
"""

import urllib.request
import re
import _locale
from bs4 import BeautifulSoup
import os
import time
import requests
from docx import Document
from docx.shared import Inches
from PIL import Image

# Force the default locale reported to Python to zh_CN / utf8.
# NOTE(review): presumably so default-encoding I/O is UTF-8 on Chinese Windows - confirm.
_locale._getdefaultlocale = (lambda *args: ['zh_CN', 'utf8'])
# URL of the single thread to debug.
line = 'http://bbs.tianya.cn/post-develop-102356-1.shtml'

gettitle = re.compile(r'<title>(.*?)</title>')
# Number on the page link that directly precedes the "下页" (next page) link.
getmaxlength = re.compile(r'<a href=".*?">(\d*)</a>\s*<a href=".*?"\s*class=.*?>下页</a>')
# Body of every reply; [\S\s] matches any character, newlines included.
gettext = re.compile(r'<div class="bbs-content">\s*([\S\s]*?)\s*</div>')
# Post header: (text before the author link, uname attribute, time).
getpagemsg = re.compile(r'<div class="atl-info">\s*<span>(.*?)<a href="http:.*uname="(.*)">.*\s*<span>时间：(.*?)</span>')
getnextpagelink = re.compile(r'<a href="(.*?)"\s*class.*?>下页</a>')
# Characters XML 1.0 cannot store; they make add_paragraph raise
# "ValueError: All strings must be XML compatible".
illegal_xml_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]')


def fetch_html(url):
    """Download url and return the response body decoded as UTF-8."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def add_text(doc, text):
    """Append text as a paragraph after dropping XML-incompatible characters."""
    doc.add_paragraph(illegal_xml_chars.sub('', text))


def add_image(doc, img_index):
    """Insert '<img_index>.jpg' from the working directory, then delete the file."""
    images = '%s.jpg' % img_index
    print('此处插入图片')
    try:
        doc.add_picture(images, width=Inches(4))  # fixed width of 4 inches
        os.remove(images)  # the local copy is no longer needed
    except Exception:
        # Deliberately best-effort: a picture that was never downloaded or
        # cannot be read must not abort the whole article.
        pass


def add_reply(doc, raw_html, img_index, paragraph_note):
    """Write one reply body to doc and return the updated image index.

    The HTML is split on whitespace; a token starting with '<img' consumes the
    next downloaded picture, every other token becomes its own paragraph.
    paragraph_note is the debug message printed after each paragraph.
    """
    cleaned = raw_html.replace('<br>', '\n').replace('title="点击图片查看幻灯模式"', '\n')
    tokens = cleaned.split()
    print(tokens)
    for token in tokens:
        if token.startswith('<img'):
            add_image(doc, img_index)
            img_index += 1
        else:
            add_text(doc, token + '\n')
            print(paragraph_note)
    return img_index


html = fetch_html(line)

# Article title, also used as the output file name.
title = gettitle.findall(html)

# Highest page number of the thread (1 when there is no pager).
page_match = getmaxlength.search(html)
if page_match:
    maxlength = page_match.group(1)
    print(maxlength)
else:
    maxlength = 1

origin_soup = BeautifulSoup(html, 'lxml')
# The opening post is read with BeautifulSoup; replies use the regex.
content = origin_soup.find(class_="bbs-content clearfix")
content_image = origin_soup.find_all('img')

# NOTE(review): this directory is created but the pictures below are written
# to the working directory, where add_image() looks for them.
new_path = os.path.join(os.getcwd(), 'pictures')
if not os.path.isdir(new_path):
    os.mkdir(new_path)

# Download the pictures as 1.jpg, 2.jpg, ... in document order.
image_couter = 1
for img in content_image:
    img_url = img.get('original')
    print (img_url)
    if img_url is None:
        continue
    try:
        # The page URL is sent as the Referer header instead of being appended
        # to the image URL, which produced malformed requests such as
        # '/1.jpghttp://bbs.tianya.cn/...shtml%0A'.
        # NOTE(review): assumes the image host checks Referer for hotlink
        # protection - confirm against the live site.
        url_content = requests.get(img_url, headers={'Referer': line}, timeout=30)
    except requests.RequestException:
        print(u'HTTP请求失败！正在准备重发...')
        time.sleep(2)
        continue

    img_name = '%s.jpg' % image_couter
    with open(img_name, 'wb') as f:
        f.write(url_content.content)
    image_couter += 1
    print('下载图片完成')

############################ write the document ############################
doc = Document()
add_text(doc, title[0] + '\n')

# Text of the opening post; whitespace left over from the HTML is removed.
if content is None:
    first_post = ''
else:
    first_post = content.text.replace('\t', '').replace('\r', '').replace('\n', '')

# Defined before the loop so it exists even if page 1 has no headers.
img_index = 1
for pageno in range(1, int(maxlength) + 1):
    i = 0

    str1 = '============================' + '第 ' + str(pageno) + ' 页' + '=============================='
    print (str1)
    add_text(doc, str1 + '\n')

    # Header (author, time) and body of every post on this page.
    pagemsg = getpagemsg.findall(html)
    text = gettext.findall(html)
    for position, ones in enumerate(pagemsg):
        # After page 1 the first header repeats the thread header: skip it.
        if pageno > 1 and position == 0:
            continue

        if 'host' in ones[0]:
            add_text(doc, '楼主：' + ones[1] + '     时间:' + ones[2] + '\n')
        else:
            add_text(doc, ones[0] + ones[1] + '    时间:' + ones[2] + '\n')

        if pageno == 1 and i == 0:
            # Opening post: paragraphs are separated by two full-width spaces;
            # an empty chunk marks the position of a picture.
            for chunk in first_post.split('\u3000\u3000')[1:]:
                if len(chunk) == 0:
                    add_image(doc, img_index)
                    img_index += 1
                else:
                    add_text(doc, chunk)
                    print('此处插入段落')
        else:
            # On page 1 the opening post is not part of `text`, so the replies
            # are shifted by one.
            if pageno == 1:
                reply_index, note = i - 1, 'cichu插入段落'
            else:
                reply_index, note = i, '插入段落'
            if reply_index < len(text):
                img_index = add_reply(doc, text[reply_index], img_index, note)
            else:
                # Headers and bodies did not line up for this page.
                print ('error occured at >>pageno:' + str(pageno) + '   line:' + str(i))

        i = i + 1

    if pageno < int(maxlength):
        # Follow the "next page" link and load that page.
        next_match = getnextpagelink.search(html)
        if next_match is None:
            break  # no link found: save what has been collected
        nextpagelink = 'http://bbs.tianya.cn' + next_match.group(1)
        html = fetch_html(nextpagelink)

doc.save(title[0] + '.doc')

问题一：
<urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>
在这里插入图片描述
解决方法参考 https://blog.csdn.net/jingsiyu6588/article/details/88653946 。实际情况中隔了五分钟直接重运行就可以了，可能是天涯的网站监督不是那么严格。

问题二：
http://bbs.tianya.cn/post-develop-1385610-1.shtml （ConnectionError: HTTPConnectionPool(host='img7.laibafile.cn.youjz.org', port=80): Max retries exceeded with url: /1.jpghttp://bbs.tianya.cn/post-develop-1385610-1.shtml%0A (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000000000C3E9978>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed'))）
第一次参考解决方法 https://www.cnblogs.com/caicaihong/p/7495435.html 解决Max retries exceeded with url的问题，但我用了一次就不好用了（内心OS：难道真的要逼我学cookie和header么）。第二天我排查发现问题出在图片URL获取上，于是参考了简书一位小哥的做法，当请求失败时不断重发，发现是可行的OMG！代码如下：

                
                try:
                    url_content = requests.get(img_url+line)
                except:
                    print(u'HTTP请求失败！正在准备重发...')
                time.sleep(2)
                continue

未解决问题html记录：

http://bbs.tianya.cn/post-develop-2056252-1.shtml
http://bbs.tianya.cn/post-free-3229755-1.shtml
…
（ValueError : All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters，未查到原因空白页处置。如有网友知道问题出在哪希望告知在下，不胜感激）
http://bbs.tianya.cn/post-worldlook-595711-1.shtml （HTTPError: Bad Gateway HTTPError: Gateway Time-out）
后来发现所有含有worldlook和stocks的地址都被封了无法访问（即在浏览器里打开也无法访问）。所以我们需要把所有不含这两个关键字的地址挑出来：

# -*- coding: utf-8 -*-
"""
Created on Sat Jun 15 10:47:15 2019

@author: Administrator
"""

import _locale
from bs4 import BeautifulSoup

from docx import Document
from docx.shared import Inches
from PIL import Image


# URLs of boards that cannot be reached are written here ...
filepath0 = 'errorWebsite.txt'
# ... and every other URL is written here.
filepath1 = 'readWebsite.txt'

# The input is opened first so a missing source file does not truncate the
# output files of an earlier run; "with" closes and flushes all three files.
# The outputs are written as UTF-8 to match the encoding the input is read with.
with open("D:\\Ayu\\website0.txt", "r", encoding='utf-8') as source, \
        open(filepath0, 'w', encoding='utf-8') as filehandle0, \
        open(filepath1, 'w', encoding='utf-8') as filehandle1:
    for line in source:
        # Lines keep their trailing newline, so they are copied unchanged.
        if 'worldlook' in line or 'stocks' in line:
            filehandle0.write(line)
        else:
            filehandle1.write(line)

做了一周的成果啊 OAO：
在这里插入图片描述

撒花花 --<-<-<@

闫小玉

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
1
评论
Python爬虫初探——天涯

天涯论坛爬取全部博文，保存为word文档一切工作从目录开始每个文章的URL得到了图片下载出了一下小问题合理的创建标题，有助于目录的生成如何改变文本的样式插入链接与图片如何插入一段漂亮的代码片生成一个适合你的列表创建一个表格设定内容居中、居左、居右SmartyPants创建一个自定义列表如何创建一个注脚注释也是必不可少的KaTeX数学公式新的甘特图功能，丰富你的文章UML 图表FLowchart流程...
复制链接

扫一扫

专栏目录