Although GAE provides a very good API for debugging crawling programs and running them in a production environment, it does not offer a convenient way to download the extracted data to a local machine.
Fortunately, there is another choice, 80legs, which provides a paid plan and also supports Java and Python wrappers; this is good news for me.
When I used GAE's urlfetch API to replace the original urllib2/urllib, I had some difficulty storing the extracted data in Bigtable:
you must convert the result to Unicode before storing it. This is very important.
e.g. ccc=Counter(html=db.Text(result.content,"utf-8"))
ccc.put()
aaa=Counter(html=db.Text(result.content,"gb18030"))  # choose the encoding based on the fetched page's charset
#!/usr/bin/env python
#coding=utf8
from google.appengine.api.labs import taskqueue
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.api import urlfetch
from google.appengine.api import memcache
class Counter(db.Model):
    """Datastore entity holding one fetched page plus its bookkeeping fields."""

    # Integer counter; explicitly excluded from datastore indexes.
    count = db.IntegerProperty(indexed=False)
    # Body of the fetched page, stored as text.
    html = db.TextProperty()
    # Link recorded alongside the page.
    url = db.LinkProperty()
    # ISBN string recorded alongside the page.
    isbn = db.StringProperty()
class CounterHandler(webapp.RequestHandler):
    """Front-end handler that queues one worker task per request."""

    def get(self):
        """Enqueue a ``/worker`` task for the requested ISBN and confirm it.

        Reads the ``isbn`` query parameter and forwards it unchanged, as the
        ``isbn`` task parameter, to ``/worker`` via ``taskqueue.add``.
        """
        isbn = self.request.get('isbn')
        # Add the task to the default queue.
        taskqueue.add(url='/worker', params={'isbn': isbn}, method='GET')
        # Emit the confirmation through the webapp response object instead of
        # ``print``: printing goes to stdout and bypasses the response that
        # the framework builds for this request.
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write("done!!!!!!!!!!!!")
class CounterWorker(webapp.RequestHandler):
    """Task-queue worker that fetches one page and stores it as a Counter."""

    # Page fetched by the worker; the same value is recorded on the entity so
    # the stored ``url`` always matches what was actually downloaded.
    TARGET_URL = "http://www.dangdang.com/"
    # Charset used to decode the fetched bytes before they are stored.
    # NOTE(review): this should follow the charset of the fetched page.
    TARGET_ENCODING = "gb18030"

    def get(self):  # should run at most 1/s
        """Fetch ``TARGET_URL`` and, on HTTP 200, persist it in the datastore.

        Reads the ``isbn`` request parameter and stores it on the new
        ``Counter`` entity together with the decoded page body and the
        fetched URL. Nothing is stored for non-200 responses.

        Errors raised by ``rpc.get_result()`` are not caught here.
        NOTE(review): presumably that includes ``urlfetch.DownloadError`` on
        a failed or timed-out fetch -- confirm whether letting it propagate
        (so the task fails) is the desired behaviour.
        """
        # Local import keeps this block self-contained; the module-level
        # import list does not include logging.
        import logging

        # Keep the value as returned by the request (no manual UTF-8
        # encoding) so it can be handed straight to the StringProperty.
        isbn = self.request.get("isbn")
        logging.debug("worker started, isbn=%r", isbn)

        rpc = urlfetch.create_rpc()
        urlfetch.make_fetch_call(rpc, self.TARGET_URL, method="GET")
        # ... other work could run here while the fetch is in flight ...
        result = rpc.get_result()
        logging.debug("fetch of %s returned status %s",
                      self.TARGET_URL, result.status_code)

        if result.status_code == 200:
            # The raw response bytes must be decoded to unicode text before
            # they are stored; db.Text does the decoding with the given
            # encoding.
            page = Counter(
                html=db.Text(result.content, self.TARGET_ENCODING),
                url=self.TARGET_URL,
                isbn=isbn,
            )
            page.put()

        logging.debug("worker done")
def main():
    """Build the WSGI application from the URL routes and run it."""
    routes = [
        ('/', CounterHandler),
        ('/worker', CounterWorker),
    ]
    application = webapp.WSGIApplication(routes)
    run_wsgi_app(application)


if __name__ == '__main__':
    main()
Fortunately, there is another choice, 80legs, which provides a paid plan and also supports Java and Python wrappers; this is good news for me.
When I used GAE's urlfetch API to replace the original urllib2/urllib, I had some difficulty storing the extracted data in Bigtable:
you must convert the result to Unicode before storing it. This is very important.
e.g. ccc=Counter(html=db.Text(result.content,"utf-8"))
ccc.put()
aaa=Counter(html=db.Text(result.content,"gb18030"))  # choose the encoding based on the fetched page's charset
#!/usr/bin/env python
#coding=utf8
from google.appengine.api.labs import taskqueue
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.api import urlfetch
from google.appengine.api import memcache
class Counter(db.Model):
    """Datastore entity holding one fetched page plus its bookkeeping fields."""

    # Integer counter; explicitly excluded from datastore indexes.
    count = db.IntegerProperty(indexed=False)
    # Body of the fetched page, stored as text.
    html = db.TextProperty()
    # Link recorded alongside the page.
    url = db.LinkProperty()
    # ISBN string recorded alongside the page.
    isbn = db.StringProperty()
class CounterHandler(webapp.RequestHandler):
    """Front-end handler that queues one worker task per request."""

    def get(self):
        """Enqueue a ``/worker`` task for the requested ISBN and confirm it.

        Reads the ``isbn`` query parameter and forwards it unchanged, as the
        ``isbn`` task parameter, to ``/worker`` via ``taskqueue.add``.
        """
        isbn = self.request.get('isbn')
        # Add the task to the default queue.
        taskqueue.add(url='/worker', params={'isbn': isbn}, method='GET')
        # Emit the confirmation through the webapp response object instead of
        # ``print``: printing goes to stdout and bypasses the response that
        # the framework builds for this request.
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write("done!!!!!!!!!!!!")
class CounterWorker(webapp.RequestHandler):
    """Task-queue worker that fetches one page and stores it as a Counter."""

    # Page fetched by the worker; the same value is recorded on the entity so
    # the stored ``url`` always matches what was actually downloaded.
    TARGET_URL = "http://www.dangdang.com/"
    # Charset used to decode the fetched bytes before they are stored.
    # NOTE(review): this should follow the charset of the fetched page.
    TARGET_ENCODING = "gb18030"

    def get(self):  # should run at most 1/s
        """Fetch ``TARGET_URL`` and, on HTTP 200, persist it in the datastore.

        Reads the ``isbn`` request parameter and stores it on the new
        ``Counter`` entity together with the decoded page body and the
        fetched URL. Nothing is stored for non-200 responses.

        Errors raised by ``rpc.get_result()`` are not caught here.
        NOTE(review): presumably that includes ``urlfetch.DownloadError`` on
        a failed or timed-out fetch -- confirm whether letting it propagate
        (so the task fails) is the desired behaviour.
        """
        # Local import keeps this block self-contained; the module-level
        # import list does not include logging.
        import logging

        # Keep the value as returned by the request (no manual UTF-8
        # encoding) so it can be handed straight to the StringProperty.
        isbn = self.request.get("isbn")
        logging.debug("worker started, isbn=%r", isbn)

        rpc = urlfetch.create_rpc()
        urlfetch.make_fetch_call(rpc, self.TARGET_URL, method="GET")
        # ... other work could run here while the fetch is in flight ...
        result = rpc.get_result()
        logging.debug("fetch of %s returned status %s",
                      self.TARGET_URL, result.status_code)

        if result.status_code == 200:
            # The raw response bytes must be decoded to unicode text before
            # they are stored; db.Text does the decoding with the given
            # encoding.
            page = Counter(
                html=db.Text(result.content, self.TARGET_ENCODING),
                url=self.TARGET_URL,
                isbn=isbn,
            )
            page.put()

        logging.debug("worker done")
def main():
    """Build the WSGI application from the URL routes and run it."""
    routes = [
        ('/', CounterHandler),
        ('/worker', CounterWorker),
    ]
    application = webapp.WSGIApplication(routes)
    run_wsgi_app(application)


if __name__ == '__main__':
    main()