获取url信息、抓取网络数据
#!/usr/bin/python
#coding:utf-8
import urllib
import urllib2
import re
print "--------获取url基本信息-----------"
response = urllib.urlopen("https://www.baidu.com/index.php?tn=87048150_dg&ch=1")
print response.getcode()
print "--------------"
print response.geturl()
print "--------------"
print response.info()
print "--------------"
print response.headers
print "--------------"
print response.read()
输出如下:
E:\python\python_jdk\python.exe E:/python/py_pro/safly/Python_Demo.py
--------获取url基本信息-----------
200
--------------
https://www.baidu.com/index.php?tn=87048150_dg&ch=1
--------------
Accept-Ranges: bytes
Cache-Control: no-cache
Content-Length: 227
Content-Type: text/html
Date: Wed, 27 Sep 2017 00:33:08 GMT
Last-Modified: Wed, 20 Sep 2017 09:59:00 GMT
P3p: CP=" OTI DSP COR IVA OUR IND COM "
P3p: CP=" OTI DSP COR IVA OUR IND COM "
Pragma: no-cache
Server: BWS/1.1
Set-Cookie: BD_NOT_HTTPS=1; path=/; Max-Age=300
Set-Cookie: BIDUPSID=D8FF89355A71DD69C09398D276CF0CEB; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1506472388; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BDRCVFR[x4e6higC8W6]=aeXf-1x8UdYcs; path=/; domain=.baidu.com
Strict-Transport-Security: max-age=0
X-Ua-Compatible: IE=Edge,chrome=1
--------------
Accept-Ranges: bytes
Cache-Control: no-cache
Content-Length: 227
Content-Type: text/html
Date: Wed, 27 Sep 2017 00:33:08 GMT
Last-Modified: Wed, 20 Sep 2017 09:59:00 GMT
P3p: CP=" OTI DSP COR IVA OUR IND COM "
P3p: CP=" OTI DSP COR IVA OUR IND COM "
Pragma: no-cache
Server: BWS/1.1
Set-Cookie: BD_NOT_HTTPS=1; path=/; Max-Age=300
Set-Cookie: BIDUPSID=D8FF89355A71DD69C09398D276CF0CEB; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1506472388; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BDRCVFR[x4e6higC8W6]=aeXf-1x8UdYcs; path=/; domain=.baidu.com
Strict-Transport-Security: max-age=0
X-Ua-Compatible: IE=Edge,chrome=1
--------------
<html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
获取图片
print "--------通过url获取图片-----------"
pic = "https://b-ssl.duitang.com/uploads/item/201407/10/20140710183824_dnwws.jpeg"
print urllib.urlretrieve(pic,filename="d://google.jpeg")
print "--------通过正则获取图片-----------"
def getHtml(url):
    """Fetch *url* and return the raw response body.

    The connection is closed before returning, even if reading fails.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Python 2's urllib response is not a context manager, so close
        # it explicitly to avoid leaking the socket.
        page.close()
def getImag(html):
    """Print and return every jpg/png ``src`` attribute found in *html*.

    Because the pattern has two capture groups, each match is a
    ``(url, extension)`` tuple, e.g. ``('a/b.png', 'png')``.

    Returns the list of matches (empty when nothing matches) so callers
    can use the result instead of only seeing it on stdout.
    """
    imglist = re.findall(r'src="(.*?\.(jpg|png))"', html)
    # Single-argument parenthesized print behaves the same as the print
    # statement on Python 2 and also parses on Python 3.
    print(imglist)
    return imglist
html = getHtml("http://www.douyu.com/directory/game/LOL")
getImag(html)
print "------------urlencode-----------"
baseUrl = "http://zzk.cnblogs.com/s?"
connedUrl = urllib.urlencode({"t":"b","w":"python"})
print connedUrl
finalUrl = baseUrl +connedUrl
print finalUrl
输出如下:
--------通过url获取图片-----------
('d://google.jpeg', <httplib.HTTPMessage instance at 0x0000000002A9C288>)
--------通过正则获取图片-----------
[('https://staticlive.douyucdn.cn/upload/game_cate/785591e9ef77cc9480c0cfce22848737.png', 'png'), ('https://cs-op.douyucdn.cn/dypart/2017/09/26/ba0c06e8004e425bb2fd8e5e16667c74.jpg', 'jpg'), ('https://apic.douyucdn.cn/upload/avanew/face/201709/05/20/e244cf1323bea91e12b302eed4fba497_middle.jpg', 'jpg'), ('https://shark.douyucdn.cn//app/douyu/re
……
(此处省略部分图片地址)
urlencode、get、post方法
print "------------urlencode-----------"
baseUrl = "http://zzk.cnblogs.com/s?"
connedUrl = urllib.urlencode({"t":"b","w":"python"})
print connedUrl
finalUrl = baseUrl +connedUrl
print finalUrl
print "------------get方法---------------"
#GET方法
#https://www.baidu.com/index.php?tn=87048150_dg&ch=1
baiduConnected = {"tn":"87048150_dg","ch":"1"}
boPage = urllib.urlopen("https://www.baidu.com/index.php?%s" %baiduConnected)
print boPage.read()
print "------------post方法---------------"
parmas = urllib.urlencode({'spam':1,'eggs':2,'bacon':0})
f=urllib.urlopen("http://python.org/query",parmas)
print f.read()
输出如下:
------------urlencode-----------
t=b&w=python
http://zzk.cnblogs.com/s?t=b&w=python
------------get方法---------------
<html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
------------post方法---------------
<!doctype html>
<!--[if lt IE 7]> <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr"> <!--<![endif]-->
……
(此处省略部分代码)
构造headers
print "----------构造headers----------"
#抓取网页内容-发送报头-1
url= "https://www.baidu.com/index.php?tn=87048150_dg&ch=1"
send_headers = {
'Host':'www.baidu.com',
'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Connection':'keep-alive'
}
req = urllib2.Request(url,headers=send_headers)
responsee = urllib2.urlopen(req)
print responsee.read()
输出如下:
----------构造headers----------
<!DOCTYPE html>
<!--STATUS OK-->
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta content="always" name="referrer">
<meta name="theme-color" content="#2932e1">
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />
<link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" />
<link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg">
<link rel="dns-prefetch" href="//s1.bdstatic.com"/>
<link rel="dns-prefetch" href="//t1.baidu.com"/>
<link rel="dns-prefetch" href="//t2.baidu.com"/>
<link rel="dns-prefetch" href="//t3.baidu.com"/>
<link rel="dns-prefetch" href="//t10.baidu.com"/>
<link rel="dns-prefetch" href="//t11.baidu.com"/>
<link rel="dns-prefetch" href="//t12.baidu.com"/>
<link rel="dns-prefetch" href="//b1.bdstatic.com"/>
<title>百度一下,你就知道</title>
<style id="css_index" index="index" type="text/css">html,body{height:100%}
……
(此处省略部分代码)
……