1.Js Ajax:
Ajax的原理简单来说通过浏览器的javascript对象XMLHttpRequest(Ajax引擎)对象向服务器发送异步请求并接收服务器的响应数据,然后用javascript来操作DOM而更新页面。这其中最关键的一步就是从服务器获得请求数据。即用户的请求间接通过Ajax引擎发出而不是通过浏览器直接发出,同时Ajax引擎也接收服务器返回响应的数据,所以不会导致浏览器上的页面全部刷新。
1.创建对象
// Create the Ajax engine: a native XMLHttpRequest where available,
// otherwise the ActiveX fallback for legacy IE (IE6 and earlier).
if (window.XMLHttpRequest) {
    xmlHttp = new XMLHttpRequest();
    if (xmlHttp.overrideMimeType) {
        // Force the response to be interpreted as XML.
        xmlHttp.overrideMimeType("text/xml");
    }
} else if (window.ActiveXObject) { // fix: was "ActiveXobject" — wrong case, so IE fallback never ran
    var activeName = ["MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
    for (var i = 0; i < activeName.length; i++) {
        try {
            xmlHttp = new ActiveXObject(activeName[i]);
            break; // stop at the first ProgID that instantiates
        } catch (e) {
            // this MSXML version is unavailable — try the next one
        }
    }
}
2.设置回调函数
xmlHttp.onreadystatechange = callback;
其中 callback 是处理服务器响应的回调函数(形如 function callback(){...}),其内容见第 5 步。
3.使用OPEN方法与服务器建立连接 xmlHttp.open("get","ajax?name="+ name,true)
此步注意设置http的请求方式(post/get),如果是POST方式,注意设置请求头信息xmlHttp.setRequestHeader("Content-Type","application/x-www-form-urlencoded")
4.向服务器端发送数据
xmlHttp.send(
null
);
5.在回调函数中针对不同的响应状态进行处理
// Handle the response: only act once the request has completed
// (readyState 4) and the server replied 200 OK.
if (xmlHttp.readyState == 4 && xmlHttp.status == 200) {
    // Write the plain-text payload straight into the #info element.
    document.getElementById("info").innerHTML = xmlHttp.responseText;
}
Ajax分析方法
Chrome 开发者工具 → Network 面板 → 用 XHR 过滤器筛选 Ajax 请求
/ajax 发给后台的request,
1.分析请求
2.分析响应
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
# Base endpoint of Weibo's mobile Ajax API; the query string is appended per request.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Headers that mimic an Ajax request from the mobile site.
# Fix: the header name is "User-Agent" (was misspelled "Use-Agent",
# so the real user agent was never sent).
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Macintosh;Intel Mac OS X 10_12_3 ) AppleWebKit/537.36(KHTML,like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page):
    """Fetch one page of the user's weibo timeline as parsed JSON.

    Args:
        page: 1-based page number forwarded to the API.

    Returns:
        The decoded JSON dict on HTTP 200, otherwise None.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
        # Non-200: return None explicitly so callers can skip this page.
        return None
    except requests.RequestException as e:
        # Broader than ConnectionError: also covers timeouts and bad URLs.
        print('Error', e.args)
        return None
def parse_page(json):
    """Yield simplified weibo records from one page of API JSON.

    Args:
        json: decoded response from get_page(); may be None.

    Yields:
        dict with id, text (HTML stripped via pyquery), and the
        attitudes/comments/reposts counters.
    """
    if not json:
        return
    # Guard against partial responses: a failed page may lack 'data' or
    # 'cards', which made the original crash with AttributeError/TypeError.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        item = card.get('mblog')
        if not item:
            continue  # some cards (headers, ads) carry no 'mblog'
        yield {
            'id': item.get('id'),
            'text': pq(item.get('text')).text(),
            'attitudes': item.get('attitudes_count'),
            'comments': item.get('comments_count'),
            'reposts': item.get('reposts_count'),
        }
if __name__ == '__main__':
    # Crawl pages 1-10 of the timeline and print each simplified record.
    for page_number in range(1, 11):
        page_json = get_page(page_number)
        print(page_json)
        for weibo in parse_page(page_json):
            print(weibo)
保存到 MongoDB:
from pymongo import MongoClient

# Connect to the local MongoDB instance; both database and collection
# are named 'weibo'.
client = MongoClient()
db = client['weibo']
collection = db['weibo']


def save_to_mongo(result):
    """Insert one weibo record into MongoDB and report success.

    Args:
        result: dict produced by parse_page().
    """
    # insert_one() replaces the deprecated Collection.insert().
    if collection.insert_one(result):
        print('save to mongo')  # fix: message must be a string literal (was a SyntaxError)
3.分析 Ajax 爬取今日头条"街拍"图片
from urllib.parse import urlencode import requests import os from hashlib import md5 import time from multiprocessing.pool import Pool //读取request base_url = 'https://www.toutiao.com/api/search/content/?' headers={ 'Host': 'www.toutiao.com', # 'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D', 'Use-Agent': 'Mozilla/5.0 (Macintosh;Intel Mac OS X 10_12_3 ) AppleWebKit/537.36(KHTML,like Gecko) Chrome/58.0.3029.110 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest', } def get_page(offset,timestamp): params={ 'aid':'24', 'app_name':'web_search', 'offset': offset, 'format':'json', 'keyword':'街拍', 'autoload': 'true', 'count':'20', 'en_qc': '1', 'cur_tab': '1', 'from':'search_tab', 'pd ':'synthesis', 'timestamp':timestamp } url=base_url+urlencode(params) print(url) try: response=requests.get(url,headers=headers) if response.status_code==200: return response.json() else: print(response.status_code) except requests.ConnectionError as e: print('Error',e.args) //获取图片地址 def get_images(json): if json.get('data'): for item in json.get('data'): print(item) title = item.get('title') print(title) images=item.get('image_list') for image in images: yield{ 'image':image.get('url'), 'title':title } //保存图片 def save_images(item): if not os.path.exists(item.get('title')): os.mkdir(item.get('title')) try: response = requests.get(item.get('image')) if response.status_code == 200: file_path ='{0}/{1}/{2}'.format(item.get('title'),md5(response.content).hexdigest(),'jpg') if not os.path.exists(file_path): with open(file_path,'wb') as f : f.write(response.content) else: print('Already Download',file_path) except requests.ConnectionError as e: print('Error', e.args) def main(offset): t=time.time() json=get_page(offset,int(round(t * 1000))) print(json) for item in get_images(json): print(item) save_images(item) //多线程 if __name__ =='__main__': pool=Pool() groups=([x*20 for x in range(0,11)]) pool.map(main,groups) pool.close() pool.join()