爬虫针对动态网页的两种处理方式:
- 对请求数据的url进行拼装
- 模拟浏览器对网页进行动态加载后得到源代码数据,再进行分析
对请求数据的url进行拼装
# -*- coding:utf-8 -*-
from scrapy.selector import Selector
from scrapy.spiders import Spider
from scrapy.http import Request,Response
import json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
'''
cmd/shell:
scrapy crawl json
'''
class JsonSpider(Spider):
    """Spider that fetches a JSON endpoint and prints each top-level field.

    Run with: scrapy crawl json
    """
    name = 'json'
    start_urls = [
        'http://acm.swust.edu.cn/online/submit/'
    ]

    def parse(self, response):
        """Decode the JSON body of *response* and print every key/value pair.

        Assumes the endpoint returns a JSON object (dict) -- TODO confirm
        against the live service.
        """
        jsonstr = response.body_as_unicode()
        print(jsonstr)
        # Reuse the already-decoded body instead of calling
        # body_as_unicode() a second time as the original did.
        obj = json.loads(jsonstr)
        print(type(obj))
        # items() avoids a second dict lookup per key.
        for attr, value in obj.items():
            print("%s: %s" % (attr, value))
模拟浏览器方式:
需要先安装 PyQt4、BeautifulSoup 和 spynner,下载中间件源码如下:
from scrapy.http import HtmlResponse
import spynner
import pyquery
import time
import BeautifulSoup
import chardet
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class DownloadWebkitMiddleware(object):
    """Downloader middleware that renders the page in a WebKit browser
    (spynner) so JavaScript-generated content appears in the response body.
    """

    def fixCharset(self, s, to_charset='utf-8'):
        """Re-encode *s* as *to_charset*, detecting the source charset.

        Returns '' when *s* is None or cannot be converted.
        """
        # BUG FIX: the original signature was fixCharset(s, to_charset=...)
        # with no `self`, so the call self.fixCharset(browser.html) bound the
        # middleware instance to `s` and the page HTML to `to_charset`.
        if s is None:
            return ''
        try:
            s = str(s)
            charset = chardet.detect(s)['encoding']
            # charset may be None on detection failure; .decode(None, ...)
            # raises TypeError, which falls through to the handler below.
            return s.decode(charset, 'ignore').encode(to_charset)
        except TypeError:
            return ''

    def process_request(self, request, spider):
        """Load *request.url* in a WebKit view and return the rendered HTML
        as an HtmlResponse (short-circuits the normal download).
        """
        browser = spynner.Browser()
        try:
            browser.create_webview()
            browser.set_html_parser(pyquery.PyQuery)
            browser.load(request.url, 20)
            try:
                browser.wait_load(10)
            except Exception:
                # Best-effort wait: if the page never settles, proceed with
                # whatever has loaded so far (original behavior).
                pass
            html = self.fixCharset(browser.html)
        finally:
            # The original leaked the WebKit view; always release it.
            browser.close()
        return HtmlResponse(request.url, body=str(html))