python 爬虫总结(一)

最新推荐文章于 2021-06-21 18:22:55 发布

黄大芬

最新推荐文章于 2021-06-21 18:22:55 发布

阅读量470

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/funfun0/article/details/48240657

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

__author__ = 'fen'
# coding=utf8
import urllib2
import urllib
from StringIO import StringIO
import bs4
def base1(url):
    """Fetch ``url`` with a plain request and return the raw response body.

    Args:
        url: Address passed straight to ``urllib2.urlopen``.

    Returns:
        The body exactly as read from the response (no decoding applied).
    """
    response = urllib2.urlopen(url)
    try:
        # read must be *called*; returning the bound method would hand the
        # caller a callable instead of the page content.
        return response.read()
    finally:
        response.close()
def agent(url, proxy=None):
    """Fetch ``url`` through a proxy-enabled opener with a browser User-Agent.

    A global opener with an HTTP ``ProxyHandler`` is installed, then the page
    is requested with browser-like headers (a workaround for servers that
    answer 403 to unidentified clients). If that request was not redirected,
    its body is returned; otherwise the page is fetched again without the
    extra headers and that body is returned instead.

    Args:
        url: Address of the page to download.
        proxy: Optional address of the HTTP proxy to route through. When
            omitted, ``url`` itself is used as the proxy address, which is
            what this function has always done.
            NOTE(review): using the target as its own proxy looks
            accidental -- confirm and pass a real proxy where one is needed.

    Returns:
        The raw response body.

    Side effects:
        Calls ``urllib2.install_opener``, so every later ``urllib2.urlopen``
        call in the process uses the proxy-enabled opener.
    """
    # Route HTTP requests through the proxy.
    proxy_address = url if proxy is None else proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_address})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # Add header information to imitate a browser; this copes with servers
    # that return 403 Forbidden. It is attempted first so that a 403 on a
    # bare request can no longer abort the function before the workaround
    # has had a chance to run.
    i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    req = urllib2.Request(url, headers=i_headers)
    response = urllib2.urlopen(req)
    try:
        if url == response.geturl():
            return response.read()
    finally:
        response.close()

    # The request was redirected: fall back to a plain fetch. It is only
    # issued when actually needed instead of on every call.
    fallback = urllib2.urlopen(url)
    try:
        return fallback.read()
    finally:
        fallback.close()
def para1(url, page):
    """Fetch one page of a listing whose page number travels in the query string.

    The page number is sent as the ``curr_Page`` query parameter (the part
    of the URL after ``?``) on a GET request carrying a browser-like
    User-Agent.

    Args:
        url: Base address of the listing, without the query string.
        page: Page number to request.

    Returns:
        The response body as text (``requests``' ``.text``).
    """
    import requests  # third-party; kept function-local as in the original

    # Some sites dislike crawlers and reject all of their requests, so we
    # disguise ourselves as a browser by setting the User-Agent header.
    # The header value can be swapped for other browser strings.
    header_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'
    headers = {'User-Agent': header_agent}

    # Query parameters appended after '?'. Other sites use different names,
    # e.g. condition.pageNo.
    payload = {'curr_Page': page}

    # A single request, with the headers: the earlier extra header-less GET
    # discarded its result and exposed the crawler to exactly the rejection
    # the User-Agent is meant to avoid.
    return requests.get(url, headers=headers, params=payload).text

def para2(url, page, rn1, rn2):
    """POST form-encoded paging parameters and return the parsed page as a string.

    Example target:
    http://gsxt.hnaic.gov.cn/notice/search/ent_except_list

    Args:
        url: Address to send the form to.
        page: Page number, sent as ``page.currentPageNo``.
        rn1: Value spliced into the ``Mozilla/5.x`` part of the User-Agent.
        rn2: Value spliced into the ``Firefox/36.x`` part of the User-Agent.

    Returns:
        ``str()`` of the ``BeautifulSoup`` document built from the response.
    """
    import gzip  # only needed here, to unpack compressed responses

    # Vary the User-Agent per call (callers pass random numbers) so the
    # crawler is less likely to be blocked.
    header_agent = "Mozilla/5."+str(rn1)+"(X11; Ubuntu; Linux x86_32; rv:37.0) Gecko/20100101 Firefox/36."+str(rn2)
    headers = {'User-Agent': header_agent}

    # Form fields; these are the parameters joined with '&'.
    values = {
        'random': '1440940998226',
        'cxyzm': 'no',
        'page.currentPageNo': str(page),
    }
    data = urllib.urlencode(values)  # encode the form fields
    req = urllib2.Request(url, data, headers=headers)
    req.add_header('Accept-encoding', 'gzip')

    response = urllib2.urlopen(req)
    try:
        raw = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''
    finally:
        response.close()

    # We asked for gzip, so the body may arrive compressed. It has to be
    # unpacked before parsing, otherwise BeautifulSoup receives binary noise.
    if 'gzip' in content_encoding.lower():
        raw = gzip.GzipFile(fileobj=StringIO(raw)).read()

    # BeautifulSoup detects the encoding by itself. The encoding could also
    # be obtained with chardet, but that is slower:
    # charset=chardet.detect(html)
    # code=charset['encoding']
    # text=str(html).decode(code,'ignore').encode('utf-8')
    soup = bs4.BeautifulSoup(StringIO(raw))

    # Callers expect plain text, so convert the soup object to str.
    return str(soup)

#print agent('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action?curr_Page=2')
#print para1('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action',2)
#print para2(url='http://gsxt.hnaic.gov.cn/notice/search/ent_except_list',page=2,rn1=3,rn2=2)

黄大芬

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 爬虫总结(一)

__author__ = 'fen'# coding=utf8import urllib2import urllibfrom StringIO import StringIOimport bs4def base1(url): content=urllib2.urlopen(url).read return contentdef agent(url): prox
复制链接

扫一扫