Python 爬虫学习

最新推荐文章于 2024-11-02 12:07:54 发布

aipei5098

最新推荐文章于 2024-11-02 12:07:54 发布

阅读量92

点赞数

文章标签：爬虫 python php

原文链接：http://www.cnblogs.com/BloodZero/p/4648891.html

版权

#coding:utf-8
#author:Blood_Zero

'''
    1、获取网页信息
    2、解决编码问题，通过charset库(默认不安装这个库文件)
'''
import urllib
import urllib2

# Page to fetch (a private-network address).
url = "http://192.168.1.135/myself/"

# Keep both the response object and its raw body: the response object is
# queried again further down for the URL, status code and headers.
html = urllib.urlopen(url)
content = html.read()

# The body is printed exactly as received. If the page is served in another
# encoding the output is garbled and would have to be transcoded first,
# e.g. content.decode('gbk').encode('utf-8').
print(content)


'''
    简易获取网页信息
'''
# URL the response was retrieved from.
print("当前URL："+str(html.geturl()))

# HTTP status code; getcode() returns the same value as the .code attribute.
print("当前状态码："+str(html.getcode()))

# Response headers; info() returns the same object as the .headers attribute.
print("当前头信息：\n"+str(html.info()))

# "charset" parameter of the Content-Type header.
print("当前网站使用编码："+str(html.headers.getparam("charset")))

# Download the page source into a local file.
urllib.urlretrieve(url,"E:\\Python_Code\\pyTools\\url.txt")


'''
    模拟浏览器访问网址
'''
# Approach 1: create the Request first, then attach headers one at a time.
req = urllib2.Request(url)
# Present a desktop Firefox User-Agent instead of the library default.
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0")
# No "Get" header is added: the HTTP method belongs to the request line, not
# to a header, and urllib2 already issues a GET when no data is supplied.
req.add_header("Host", "192.168.1.135")

new_html = urllib2.urlopen(req)
try:
    print(new_html.read())
finally:
    # Release the connection even if reading the body fails.
    new_html.close()
# Show the headers that were attached to the request.
print(req.headers.items())

# Approach 2: pass all headers to the Request constructor as a dict.
# No "Get" entry is included: the HTTP method belongs to the request line,
# not to a header, and urllib2 already issues a GET when no data is supplied.
myheader = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Host": "192.168.1.135",
}
req1 = urllib2.Request(url, headers=myheader)
new_html_1 = urllib2.urlopen(req1)
try:
    print(new_html_1.read())
finally:
    # Release the connection even if reading the body fails.
    new_html_1.close()
# Show the headers that were attached to the request.
print(req1.headers.items())


'''
    在网页中查询指定文件
'''
def get_content(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open with ``urllib.urlopen``.

    Returns:
        The body exactly as read from the response; no decoding is applied.

    Raises:
        IOError: Propagated from ``urllib.urlopen`` when the URL cannot be
            opened.
    """
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        # Close the handle even when read() raises, so it is never leaked.
        html.close()

def get_file(self):
    """Print and return the ``.php`` link targets found in an HTML string.

    Args:
        self: The HTML text to search. Despite the name this is a plain
            string, not an instance; the name is kept so existing callers
            are unaffected.

    Returns:
        A list with one entry per ``a href=...php`` occurrence, in document
        order. Each entry is the text captured after ``href=`` up to and
        including ``.php``, so a quoted attribute keeps its opening quote
        (``'"index.php'``). The list is empty when nothing matches.
    """
    # Imported here because the module itself never imports ``re``; without
    # this the function fails with NameError on its first call.
    import re

    # Match PHP files referenced by anchor tags.
    pat = re.compile(r'a href=(.+?\.php)')

    file_code = pat.findall(self)
    print(str(file_code)+"\n")
    return file_code

# Fetch the SQL_Injection page and print the .php links found in its source.
info = get_content("http://192.168.1.135/myself/SQL_Injection/")
get_file(info)