一、普通抓取(这里演示的是抓取百度首页的相关代码,包括css和js)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
def clear():
    """Pause for three seconds, then clear the terminal screen.

    Prints a notice that the screen is about to be paged, sleeps for three
    seconds, and runs the platform's clear command: ``cls`` when
    ``platform.system()`` reports Windows, ``clear`` otherwise.
    """
    # Imported locally because the surrounding snippet only imports urllib2;
    # without these the function fails with NameError on its first call.
    import os
    import platform
    import time

    print(u'内容较多,显示3秒后翻页')
    time.sleep(3)
    command = 'cls' if platform.system() == u'Windows' else 'clear'
    os.system(command)
def linkBaidu():
    """Download the Baidu home page and save it to ``./baidu.txt``.

    After saving the page body, prints the final URL, the HTTP status code
    and the response headers, followed by a note telling the user where the
    content was written.

    Raises:
        SystemExit: with status 1 when the URL cannot be opened.
    """
    from contextlib import closing

    url = 'http://www.baidu.com'
    try:
        response = urllib2.urlopen(url, timeout=3)
    except urllib2.URLError:
        print(u"网络地址错误")
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, and report the failure with a non-zero status.
        raise SystemExit(1)

    # closing() releases the connection even if reading or writing fails.
    with closing(response):
        # Binary mode: response.read() returns raw bytes, and text mode would
        # rewrite line endings on Windows.
        with open('./baidu.txt', 'wb') as fp:
            fp.write(response.read())
        print(u"获取url信息,response.geturl() \n: %s" % response.geturl())
        print(u"获取返回代码,response.getcode() \n: %s" % response.getcode())
        print(u"获取返回信息,response.info() \n: %s" % response.info())
    print(u"获取的网页内容已存入当前目录的baidu.txt中,请自行查看")
# Script entry point: fetch the page only when this file is run directly.
if __name__ == "__main__":
    linkBaidu()
二、通过代理(免费proxy)爬取。与第一点相比,好处是不太容易被封住,或者说被封住后可以更换代理
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import sys
import re
def testArgument():
    """Validate the command line and start the proxy test.

    The script expects exactly one argument (the proxy URL). With any other
    argument count it prints a hint plus the usage text and exits; otherwise
    it constructs ``TestProxy`` with that argument, which performs the check.

    Raises:
        SystemExit: with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        print(u'只需要一个参数就够了')
        tipUse()
        # sys.exit instead of the site-provided exit() helper; a usage error
        # is reported with a non-zero status.
        sys.exit(1)
    # The instance is not needed afterwards: TestProxy does all of its work
    # in its constructor.
    TestProxy(sys.argv[1])
def tipUse():
    """Print the usage help for the proxy-testing script.

    The first line states that exactly one argument, a usable proxy, is
    accepted. The next two lines are command-line examples: ``python``,
    then the script name, then the proxy address.
    """
    usageLines = (
        u'该程序只能输入一个参数,这个参数必须是一个可用的proxy',
        u'usage: python testUrllib2WithProxy.py http://1.2.3.4:5',
        u'usage: python testUrllib2WithProxy.py https://1.2.3.4:5',
    )
    for line in usageLines:
        print(line)
class TestProxy(object):
    """Check whether a proxy given as ``scheme://a.b.c.d:port`` is usable.

    Constructing an instance does all the work: it validates the proxy
    string, then requests ``self.url`` through the proxy and reports whether
    ``self.flagWord`` appears in the returned page.
    """

    # scheme://a.b.c.d:port with nothing before or after it. \Z (not $) so a
    # trailing newline is rejected as well.
    _PROXY_RE = re.compile(
        r'(https?)://(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})\Z')

    # Inclusive (low, high) bounds for the four IPv4 octets, in order.
    _OCTET_BOUNDS = ((1, 255), (0, 255), (0, 255), (1, 254))

    def __init__(self, proxy):
        """Validate ``proxy`` and immediately test it against ``self.url``."""
        self.proxy = proxy
        self.checkProxyFormat(self.proxy)
        self.url = 'http://www.baidu.com'
        self.timeout = 5
        self.flagWord = '百度'  # keyword looked for in the returned page
        self.useProxy(self.proxy)

    @staticmethod
    def _inRange(text, low, high):
        """Return True if ``text`` is a canonical decimal in [low, high].

        "Canonical" means no leading zeros ('01' is rejected), matching the
        previous comparison against ``str(n)`` values.
        """
        number = int(text)
        return str(number) == text and low <= number <= high

    def checkProxyFormat(self, proxy):
        """Validate the proxy string; print usage and exit if it is invalid.

        Accepted form: ``http://`` or ``https://``, a dotted IPv4 address
        whose octets satisfy ``_OCTET_BOUNDS``, a colon, and a port in
        1..65535.

        Raises:
            SystemExit: with status 1 when ``proxy`` is not acceptable.
        """
        match = self._PROXY_RE.match(proxy)
        valid = False
        if match is not None:
            fields = match.groups()
            octets, port = fields[1:5], fields[5]
            valid = all(
                self._inRange(octet, low, high)
                for octet, (low, high) in zip(octets, self._OCTET_BOUNDS)
            ) and self._inRange(port, 1, 65535)  # 65535 is a valid port
        if not valid:
            tipUse()
            sys.exit(1)
        print(u'输入的http代理服务器符合标准')

    def useProxy(self, proxy):
        """Fetch ``self.url`` through ``proxy`` and look for the keyword.

        Prints whether the proxy is usable, judged by ``self.flagWord``
        occurring in the response body.

        Raises:
            SystemExit: with status 1 when the request or the read fails.
        """
        protocol, _, address = proxy.partition('://')
        # NOTE(review): ProxyHandler keys are matched against the scheme of
        # the *requested* URL. With an 'https' proxy argument the http://
        # test URL may not be routed through the proxy at all - confirm the
        # intended behaviour.
        opener = urllib2.build_opener(urllib2.ProxyHandler({protocol: address}))
        urllib2.install_opener(opener)
        try:
            response = urllib2.urlopen(self.url, timeout=self.timeout)
            try:
                body = response.read()
            finally:
                response.close()
        except Exception:
            # Not a bare except: Ctrl-C and SystemExit must still propagate.
            print(u'连接错误,退出程序')
            sys.exit(1)
        if self.flagWord in body:
            print(u'已取得特征词,该代理可用')
        else:
            print(u'该代理不可用')
# Script entry point: check the command line and test the proxy only when
# this file is run directly.
if __name__ == "__main__":
    testArgument()
三、有的网站会验证你是机器人还是人,所以第三种方法是赋予爬虫一个浏览器的header来冒充人
先给出常用header的python包:userAgents
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# Desktop browser header lines, keyed by a human-readable browser label.
# Every value is a full "User-Agent:<value>" line; some entries have a space
# after the colon and some do not.
pcUserAgent = {
    # Safari
    "safari 5.1 – MAC":
        "User-Agent:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "safari 5.1 – Windows":
        "User-Agent:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # Internet Explorer
    "IE 9.0":
        "User-Agent:Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
    "IE 8.0":
        "User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "IE 7.0":
        "User-Agent:Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "IE 6.0":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    # Firefox
    "Firefox 4.0.1 – MAC":
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Firefox 4.0.1 – Windows":
        "User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Opera
    "Opera 11.11 – MAC":
        "User-Agent:Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera 11.11 – Windows":
        "User-Agent:Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    # Chrome
    "Chrome 17.0 – MAC":
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    # Other browsers
    "Maxthon":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Tencent TT":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "The World 2.x":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "The World 3.x":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "sogou 1.x":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "360":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Avant":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Green Browser":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
}
# Mobile browser header lines, keyed by a human-readable device/browser
# label. Every value is a full "User-Agent:<value>" line; some entries have a
# space after the colon and some do not.
mobileUserAgent = {
    # iOS
    "iOS 4.33 – iPhone":
        "User-Agent:Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "iOS 4.33 – iPod Touch":
        "User-Agent:Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "iOS 4.33 – iPad":
        "User-Agent:Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Android N1":
        "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Android QQ":
        "User-Agent: MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # The trailing space in the next key is part of the original key.
    "Android Opera ":
        "User-Agent: Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Android Pad Moto Xoom":
        "User-Agent: Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # Other platforms
    "BlackBerry":
        "User-Agent: Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "WebOS HP Touchpad":
        "User-Agent: Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Nokia N97":
        "User-Agent: Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Windows Phone Mango":
        "User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC browser variants
    "UC":
        "User-Agent: UCWEB7.0.2.37/28/999",
    "UC standard":
        "User-Agent: NOKIA5700/ UCWEB7.0.2.37/28/999",
    "UCOpenwave":
        "User-Agent: Openwave/ UCWEB7.0.2.37/28/999",
    "UC Opera":
        "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
}
然后是修改header版爬虫
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import userAgents
'''userAgents.py是个自定义的模块,位置处于当前目录下 '''
class Urllib2ModifyHeader(object):
    """Request a page with urllib2 while sending a custom User-Agent header.

    Constructing an instance performs two requests to ``self.url``: one with
    the desktop 'IE 9.0' entry and one with the mobile 'UC standard' entry of
    the ``userAgents`` module. Each response is appended to ``1.html`` and
    ``2.html`` respectively.
    """

    def __init__(self):
        """Pick the two User-Agent lines and fetch the test page with each."""
        # Desktop (PC) + IE User-Agent line.
        pcIeUserAgent = userAgents.pcUserAgent.get('IE 9.0')
        # Mobile + UC browser User-Agent line.
        mobileUcUserAgent = userAgents.mobileUserAgent.get('UC standard')
        # Youdao Translate is the site used for the test.
        self.url = 'http://fanyi.youdao.com'
        self.useUserAgent(pcIeUserAgent, 1)
        self.useUserAgent(mobileUcUserAgent, 2)

    @staticmethod
    def _splitHeader(headerLine):
        """Split a ``Name: value`` header line into ``(name, value)``.

        Only the first colon separates name from value, so values that
        themselves contain colons (e.g. ``rv:2.0.1``) stay intact.
        Surrounding whitespace is stripped from both parts.
        """
        name, _, value = headerLine.partition(':')
        return name.strip(), value.strip()

    def useUserAgent(self, userAgent, name):
        """Fetch ``self.url`` with the given header and append it to a file.

        Args:
            userAgent: full header line of the form ``User-Agent:<value>``.
            name: value whose ``str()`` plus ``.html`` is the output file
                name.
        """
        headerName, headerValue = self._splitHeader(userAgent)
        request = urllib2.Request(self.url)
        request.add_header(headerName, headerValue)
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        fileName = str(name) + '.html'
        # Binary append: the body is raw bytes, and text mode would rewrite
        # line endings on Windows.
        with open(fileName, 'ab') as fp:
            fp.write("%s\n\n" % userAgent)
            fp.write(body)
# Script entry point: constructing the object performs both requests.
if __name__ == "__main__":
    umh = Urllib2ModifyHeader()
资料摘自 python网络爬虫实战