1. Fetching a page the simple way
import urllib2

content = urllib2.urlopen('http://www.hao123.com').read()
f = open("1.html", 'wb')  # the response body is raw bytes, so write in binary mode
f.write(content)
f.close()
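The code in this post targets Python 2. On Python 3, urllib2 was folded into urllib.request; a minimal sketch of the same fetch there would be:

import urllib.request

content = urllib.request.urlopen('http://www.hao123.com').read()
with open("1.html", "wb") as f:  # read() returns bytes
    f.write(content)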
But some sites protect against this kind of bare request, so we have to masquerade as a normal browser request.
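The whole trick is attaching browser-like headers to the Request object; a minimal sketch of that idea (the User-Agent string is just an example) before the fuller class in section 2:

import urllib2

req = urllib2.Request(
    'http://www.hao123.com',
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 5.1) Gecko/20101203 Firefox/3.6.13"}
)
content = urllib2.urlopen(req).read()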
2. A browser-like requester with cookie support
# -*- coding: cp936 -*-
import urllib, urllib2, cookielib

class Dawn:
    '''A helper for making browser-like requests. Just written for fun;
    it has been a year since I last wrote any Python, so most of it is forgotten.'''
    timeout = 30

    def __init__(self):
        '''Initialize the opener, adding cookie support'''
        httpHandler = urllib2.HTTPHandler()
        httpsHandler = urllib2.HTTPSHandler()
        cookie = cookielib.CookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookie_support, httpHandler, httpsHandler)
        # install globally, so every urllib2.urlopen call carries cookies
        urllib2.install_opener(opener)

    def getHeader(self):
        '''Return browser-like headers'''
        header = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13",
            #"User-Agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.5",
            #"Accept-Encoding": "gzip,deflate",  # leave this off unless you also decompress the response
            "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
            "Keep-Alive": "115",
            "Connection": "keep-alive"
        }
        return header

    def request(self, url, headers=None, data=None):
        '''Issue the request; returns the page source, or None on failure'''
        if headers is None:
            headers = self.getHeader()
        # build the request with the browser-like headers
        req = urllib2.Request(
            url=url,
            headers=headers
        )
        if data is not None:
            data = urllib.urlencode(data)  # a non-None body turns this into a POST
        # fire the request
        try:
            request = urllib2.urlopen(req, data, self.timeout)
            source = request.read()
            request.close()
        except Exception:
            source = None
            #print "connect failed..."
        return source
if __name__ == "__main__":
    dawn = Dawn()
    urls = ["www.baidu.com", "www.163.com", "oschina.net", "www.sina.com"]
    for item in urls:
        url = "http://" + item
        fileName = item + ".html"
        content = dawn.request(url)
        if content is None:  # skip sites that failed to respond
            continue
        f = open(fileName, "wb")
        f.write(content)
        f.close()
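Since request() url-encodes its data argument and passes it to urlopen, a non-None data dict turns the call into a POST. A usage sketch; the URL and field names below are made up for illustration:

# hypothetical form endpoint and field names, for illustration only
dawn = Dawn()
source = dawn.request(
    "http://example.com/login",
    data={"username": "dawn", "password": "secret"}
)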