爬虫准备
1 装载虚拟环境
pip install virtualenv
可以在任意位置创建虚拟环境
virtualenv 文件夹名
如果想指定python版本 -p python3.exe (路径)
2 更新pip
python -m pip install --upgrade pip
3 获取响应头
response = urllib2.urlopen () 之后
response.info () 获取所有响应头
response
Fiddler
makecert.exe -r -ss my -n "CN=DO_NOT_TRUST_FiddlerRoot, O=DO_NOT_TRUST, OU=Created by http://www.fiddler2.com" -sky signature -eku 1.3.6.1.5.5.7.3.1 -h 1 -cy authority -a sha1 -m 120 -b 09/05/2012
在cmd里面运行
爬虫爬取
一些知识
1 直接编译,在直接网址输入时引用(可看百度翻译)
# Example: percent-encode user input before inserting it directly into a URL
# (see the Baidu Translate example). Python 2: quote lives in urllib.
from urllib import quote

if __name__ == '__main__':
    # Read the text to translate from the console (Python 2 raw_input).
    key = raw_input('请输入要翻译的内容:')
    # Fixed: the guarded lines were flush-left in the original, which does
    # not parse; they belong inside the `if` body.
    key = quote(key)
2 方法编译,网址里面有=号,通过字典编译(可看爬取贴吧)
# Example: build a query string from a dict (see the Tieba crawler below).
# `key`, `base_url`, `urllib`, `urllib2` are assumed to be defined by the
# surrounding example — this fragment is not runnable on its own.
xq = { 'pn' :key, }
# urlencode turns the dict into 'pn=<value>' with proper percent-escaping.
xq = urllib.urlencode(xq)
fullurl = base_url + xq
request = urllib2.Request(fullurl)
response = urllib2.urlopen(request)
3 post和get请求
post
# POST request: the payload must be URL-encoded first; passing data= is
# what makes urllib2 send a POST instead of a GET.
data = urllib.urlencode(data)
# Fixed typo: the original spelled the variable 'requeset'.
request = urllib2.Request(base_url, data=data)
get不用
4 正则跨行(加 re.S)
# Compile once; DOTALL (alias re.S) makes '.' match newlines too, so a
# <tr>...</tr> row spanning several source lines is still captured.
td_pattern = re.compile(r'<tr.*?</tr>', re.DOTALL)
常用方法
1
# Disable HTTPS certificate verification globally.
# WARNING: this turns off TLS validation for every urlopen in the process —
# convenient for scraping self-signed/expired-cert sites, unsafe for
# anything security-sensitive. Uses private ssl attributes.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
2 json转化成字典
json.loads(内容) 通常用于查看json里的内容
# Pretty-print `data` as JSON; ensure_ascii=False keeps non-ASCII (e.g.
# Chinese) characters readable instead of \uXXXX escapes. Python 2 print
# statement; `data` and `json` come from the surrounding example.
print json.dumps(data,indent=4 ,ensure_ascii=False )
基本
爬取一个网页
#coding:utf8
import urllib2
base_url =
request = urllib2.Request (base_url)
response = urllib2.urlopen(request ,timeout=30 )
print response .read()
查询爬取网页
import urllib
import urllib2
def search (key) :
base_url = 'http://www.baidu.com/s?'
qs = {
'wd' :key,
}
qs = urllib.urlencode(qs)
fullurl = base_url + qs
request = urllib2.Request(fullurl)
response = urllib2.urlopen(request)
return response.read()
if __name__ == '__main__' :
key = raw_input('请输入查询关键字:' )
result = search(key)
print result
爬取贴吧第几页到第几页
import urllib
import urllib2
import os
def search(key):
    """Download one page of the Tieba forum listing and return its HTML.

    *key* is the 'pn' (page offset) value appended to the fixed forum URL.
    """
    base_url = 'http://tieba.baidu.com/f?kw=%C3%C0%C5%AE&'
    # Encode the page offset as the query-string suffix.
    params = urllib.urlencode({'pn': key})
    page = urllib2.urlopen(urllib2.Request(base_url + params))
    return page.read()
if __name__ == '__main__' :
ys = raw_input(<