Python学习笔记之十（爬虫进阶2）

最新推荐文章于 2022-09-19 21:03:04 发布

xuanjat

最新推荐文章于 2022-09-19 21:03:04 发布

阅读量228

点赞数

分类专栏： Python学习笔记文章标签： python 学习

本文链接：https://blog.csdn.net/xuanjat/article/details/97006452

版权

Python学习笔记专栏收录该内容

11 篇文章 0 订阅

订阅专栏

Python学习笔记之十（爬虫进阶2）

2019-07-22 09:44:35 星期一

本课概要

作业讲解
抓包分析概述
使用Fiddler进行抓包分析
抓取HTTPS数据包
爬取腾讯视频的评论

抓包分析概述

所谓抓包分析，即对网络传输中发送与接收的数据包进行抓取并分析的操作。做爬虫时，数据并不一定就在HTML源码中，很可能隐藏在其他一些网址中，所以，我们要抓取某些数据，就需要进行抓包，分析出对应数据所在的网址，然后分析规律并爬取。

使用Fiddler进行抓包分析

利用Fiddler获取腾讯视频评论信息的url

略（证书认证失败）

微信爬虫

什么是微信爬虫

所谓微信爬虫，即自动获取微信的相关文章信息的一种爬虫。微信对我们的限制是很多的，所以，我们需要采取一些手段解决这些限制，主要包括伪装浏览器、使用代理IP等方式。

#微信爬虫实战
import re
import urllib.request
import time
import urllib.error
#自定义函数，功能是使用代理服务器爬一个网址
def use_proxy(proxy_addr, url, timeout=30):
    """Download *url* through the proxy at *proxy_addr* and return the raw body.

    Args:
        proxy_addr: Proxy location written as "host:port".
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the crawl forever.

    Returns:
        The response body as bytes, or None when the request failed. Failures
        are printed rather than raised.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36")
        # Register the proxy for both schemes: a mapping that only contains
        # "http" lets every https:// request go out directly, bypassing it.
        proxy_handler = urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
        opener = urllib.request.build_opener(proxy_handler)
        # Open through this opener directly instead of install_opener(), so a
        # call does not replace the process-wide default opener.
        with opener.open(req, timeout=timeout) as response:
            return response.read()
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:" + str(e))
        # Pause for one second after an unexpected error before returning.
        time.sleep(1)
    return None
# Search keyword.
key = "python"
# Proxy server address (host:port).
proxy = "127.0.0.1:8888"
# URL-quote the keyword once, up front. Quoting inside the page loop would
# re-encode an already quoted value ("%" becomes "%25") on every iteration.
quoted_key = urllib.request.quote(key)
# Captures the href value of every <a href="..."> tag; compiled once.
link_pattern = re.compile('<a href="(.*?)"', re.S)
# Crawl five result pages (page indices 0-4).
for i in range(0, 5):
    thispageurl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + quoted_key + "&page=" + str(i)
    print(thispageurl)
    thispagedata = use_proxy(proxy, thispageurl)
    # use_proxy returns None on failure; decode real bytes instead of running
    # the regex over the "b'...'" repr that str(bytes) produces.
    page_text = thispagedata.decode("utf-8", "ignore") if thispagedata else ""
    print(len(page_text))
    rs1 = link_pattern.findall(page_text)
    # No links means the page failed. The previous check was inverted and
    # skipped exactly the pages that DID contain links.
    if not rs1:
        print("此次("+str(i)+"页)没成功")
        continue
    for j, thisurl in enumerate(rs1):
        # Turn the HTML-escaped "&amp;" in the href back into a plain "&".
        thisurl = thisurl.replace("amp;", "")
        print(thisurl)
        file = "F:/PL/weixinInf"+str(i)+"页第"+str(j)+"篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        if thisdata is None:
            # Download failed: report it without creating an empty file.
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")
            continue
        try:
            # The with-block guarantees the file is closed (the earlier
            # "fh.close" without parentheses never closed it).
            with open(file, "wb") as fh:
                fh.write(thisdata)
        except OSError as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")
        else:
            print("第"+str(i)+"页第"+str(j)+"篇文章成功")

#csdn博客爬虫实战
import re
import urllib.request
import time
import urllib.error
#自定义函数，功能是使用代理服务器爬一个网址
def use_proxy(proxy_addr, url, timeout=30):
    """Download *url* through the proxy at *proxy_addr* and return the raw body.

    Args:
        proxy_addr: Proxy location written as "host:port".
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the crawl forever.

    Returns:
        The response body as bytes, or None when the request failed. Failures
        are printed rather than raised.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36")
        # Register the proxy for both schemes: a mapping that only contains
        # "http" lets every https:// request go out directly, bypassing it.
        proxy_handler = urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
        opener = urllib.request.build_opener(proxy_handler)
        # Open through this opener directly instead of install_opener(), so a
        # call does not replace the process-wide default opener.
        with opener.open(req, timeout=timeout) as response:
            return response.read()
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:" + str(e))
        # Pause for one second after an unexpected error before returning.
        time.sleep(1)
    return None
# Search keyword.
key = "python"
# Proxy server address (host:port).
proxy = "127.0.0.1:8888"
# URL-quote the keyword once, up front. Quoting inside the page loop would
# re-encode an already quoted value ("%" becomes "%25") on every iteration.
quoted_key = urllib.request.quote(key)
# Captures the href value of every <a href="..."> tag; compiled once.
link_pattern = re.compile('<a href="(.*?)"', re.S)
# Crawl three result pages (page indices 0-2).
for i in range(0, 3):
    thispageurl = "https://so.csdn.net/so/search/s.do?p="+str(i)+"&q="+quoted_key+"&t=blog&domain"
    print(thispageurl)
    thispagedata = use_proxy(proxy, thispageurl)
    # use_proxy returns None on failure; decode real bytes instead of running
    # the regex over the "b'...'" repr that str(bytes) produces.
    page_text = thispagedata.decode("utf-8", "ignore") if thispagedata else ""
    print(len(page_text))
    rs1 = link_pattern.findall(page_text)
    if not rs1:
        print("此次("+str(i)+"页)没成功")
        continue
    for j, thisurl in enumerate(rs1):
        file = "F:/PL/weixinInf/"+str(i)+"页第"+str(j)+"篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        if thisdata is None:
            # Download failed: report it without creating an empty file.
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")
            continue
        try:
            # The with-block guarantees the file is closed (the earlier
            # "fh.close" without parentheses never closed it).
            with open(file, "wb") as fh:
                fh.write(thisdata)
        except OSError as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")
        else:
            print("第"+str(i)+"页第"+str(j)+"篇文章成功")

多线程爬虫

本课概要

什么是多线程爬虫
糗事百科段子爬虫
将糗事百科段子爬虫改造为多线程爬虫

什么是多线程爬虫

所谓多线程，即程序中的某些程序段并行执行，合理地设置多线程，可以让爬虫效率更高。

#多线程爬虫
#传统爬虫
import re
import urllib.request
import time
import urllib.error
# Install a process-wide opener that sends a browser-like User-Agent header
# with every subsequent urllib.request.urlopen() call.
opener = urllib.request.build_opener()
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
)
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Fetch listing pages 1-3 one after another and print every snippet the
# pattern captures on each page.
for i in range(1, 4):
    url = "https://www.qiushibaike.com/8hr/page/{}".format(i)
    response = urllib.request.urlopen(url)
    pagedata = response.read().decode("utf-8", "ignore")
    pat = '<div class="recmd-right">.*?<a class="recmd-content" href=.*?>(.*?)</a>'
    # re.S lets "." span newlines, since the markup covers several lines.
    datalist = re.findall(pat, pagedata, re.S)
    for j, joke in enumerate(datalist):
        print("第" + str(i) + "页第" + str(j) + "个段子的内容是：")
        print(joke)

#将传统爬虫改造为多线程爬虫
#线程入门
import threading
class A(threading.Thread):
    """Demo thread that prints a fixed message ten times."""

    def __init__(self, *args, **kwargs):
        """Initialise the underlying Thread, forwarding any Thread arguments.

        The constructor has to be spelled ``__init__`` (double underscores);
        a method named ``_init_`` is never called by Python.
        """
        super().__init__(*args, **kwargs)

    def run(self):
        """Thread body; runs in the new thread once start() is called."""
        for _ in range(10):
            print("我是线程A")
class B(threading.Thread):
    """Demo thread that prints a fixed message ten times."""

    def __init__(self, *args, **kwargs):
        """Initialise the underlying Thread, forwarding any Thread arguments.

        The constructor has to be spelled ``__init__`` (double underscores);
        a method named ``_init_`` is never called by Python.
        """
        super().__init__(*args, **kwargs)

    def run(self):
        """Thread body; runs in the new thread once start() is called."""
        for _ in range(10):
            print("我是线程B")
# Create one instance of each demo thread, then start them in order (A first,
# then B); both then run concurrently with the main thread.
t1 = A()
t2 = B()
for demo_thread in (t1, t2):
    demo_thread.start()

#将传统爬虫改造为多线程爬虫
import threading
import re
import urllib.request
import time
import urllib.error
# Build an opener that sends a browser-like User-Agent header and install it
# as the process-wide default used by urllib.request.urlopen().
opener = urllib.request.build_opener()
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
)
opener.addheaders = [headers]
urllib.request.install_opener(opener)
#data=urllib.request.urlopen(req).read()
class One(threading.Thread):
    """Worker thread that crawls the odd-numbered listing pages (1, 3, ..., 35).

    Args:
        pages: Iterable of page numbers to crawl; defaults to the odd pages
            from 1 to 35. Remaining arguments are forwarded to Thread.
    """

    def __init__(self, *args, pages=range(1, 36, 2), **kwargs):
        # The constructor has to be spelled __init__ (double underscores);
        # a method named _init_ is never called by Python.
        super().__init__(*args, **kwargs)
        self.pages = pages

    def run(self):
        """Fetch each page in self.pages and print every snippet found on it."""
        pat = '<div class="recmd-right">.*?<a class="recmd-content" href=.*?>(.*?)</a>'
        # Compile once; re.S lets "." span the newlines inside the markup.
        matcher = re.compile(pat, re.S)
        for i in self.pages:
            url = "https://www.qiushibaike.com/8hr/page/" + str(i)
            try:
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            except urllib.error.URLError as e:
                # One bad page must not kill the thread and lose the rest.
                print("第" + str(i) + "页抓取失败：" + str(e))
                continue
            datalist = matcher.findall(pagedata)
            for j, joke in enumerate(datalist):
                print("第" + str(i) + "页第" + str(j) + "个段子的内容是：")
                print(joke)
class Two(threading.Thread):
    """Worker thread that crawls the even-numbered listing pages (2, 4, ..., 34).

    Args:
        pages: Iterable of page numbers to crawl; defaults to the even pages
            from 2 to 34. Remaining arguments are forwarded to Thread.
    """

    # The default starts at 2, not 0: the other crawlers in this file number
    # listing pages from 1, so a request for page 0 is not a real page.
    def __init__(self, *args, pages=range(2, 36, 2), **kwargs):
        # The constructor has to be spelled __init__ (double underscores);
        # a method named _init_ is never called by Python.
        super().__init__(*args, **kwargs)
        self.pages = pages

    def run(self):
        """Fetch each page in self.pages and print every snippet found on it."""
        pat = '<div class="recmd-right">.*?<a class="recmd-content" href=.*?>(.*?)</a>'
        # Compile once; re.S lets "." span the newlines inside the markup.
        matcher = re.compile(pat, re.S)
        for i in self.pages:
            url = "https://www.qiushibaike.com/8hr/page/" + str(i)
            try:
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            except urllib.error.URLError as e:
                # One bad page must not kill the thread and lose the rest.
                print("第" + str(i) + "页抓取失败：" + str(e))
                continue
            datalist = matcher.findall(pagedata)
            for j, joke in enumerate(datalist):
                print("第" + str(i) + "页第" + str(j) + "个段子的内容是：")
                print(joke)
# Create both crawler threads, then start them in order (One first, then Two)
# so the two page sets are fetched concurrently.
one = One()
two = Two()
for crawler in (one, two):
    crawler.start()

scrapy框架的安装

本课概要

什么是Scrapy框架
安装Scrapy框架及各种常见错误解决技巧
少坑版安装方式
常见错误与解决
作业

什么是Scrapy框架

Scrapy是一个Python爬虫框架，非常适合做一些大型爬虫项目，并且开发者利用这个框架，可以不用过多关注细节，我们会重点说这个框架。
Scrapy的官网地址是：http://scrapy.org/

少坑版安装方式

由于Scrapy框架涉及太多依赖库，在此，如果想省事的朋友，可以按照这种方式去安装。想了解具体过程的朋友，可以先执行 pip install scrapy，然后遇到错误时参见后面的方式解决。
接下来讲解少坑安装方式。
0.开个VPN或者采用本地安装方式
1.首先，升级pip：
python -m pip install --upgrade pip
2、安装Visual Studio 2015专业版，去 http://download.microsoft.com/download/B/8/9/B898E46E-CBAE-4045-A8E2-2D33DD36F3C4/vs2015.pro_chs.iso 下载
3、安装lxml
http://www.lfd.uci.edu/~gohlke/pythonlibs/
找"lxml-"，下载对应版本的whl文件，然后依次执行：pip install wheel、pip install lxml-3.6.4-cp35-cp35m-win_amd64.whl
4、pip install scrapy或pip install scrapy==1.1.0rc3

安装成功

xuanjat

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python学习笔记之十（爬虫进阶2）

Python学习笔记之十（爬虫进阶2）2019-07-22 09:44:35 星期一本课概要作业讲解抓包分析概述使用Fiddler进行抓包分析抓取HTTPS数据包爬取腾讯视频的评论抓包分析概述所谓抓包分析，即将网络传输发送与接收的数据包进行抓取的操作，做爬虫时，数据并不一定就在HTML源码中，很可能隐藏在一些网址中，所以，我们要抓取某些数据，就需要进行抓包，分析出对应数据所隐...
复制链接

扫一扫

专栏目录