Using GAE as a crawler

Although GAE provides a very good API for debugging and for running a crawling program in a production environment, it is not convenient to download the extracted data to a local machine.
Fortunately, there is another choice, 80legs, which offers a paid plan and also ships Java & Python wrappers; that is good news to me.

When I used GAE's urlfetch API to replace the original urllib2/urllib, I hit some difficulty storing the extracted data in the datastore (Bigtable): you have to convert the result to unicode before storing it. This is very important, e.g.:

ccc = Counter(html=db.Text(result.content, "utf-8"))
ccc.put()

aaa = Counter(html=db.Text(result.content, "gb18030"))  # pick the encoding from the fetched page's charset
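
If the charset is not known in advance, it can usually be read from the response's Content-Type header before the conversion. The helper below is only a sketch of that idea (the function name and the utf-8 fallback are my own choices, not part of the original code):

# sketch: choose the charset from the response headers, falling back to a default
from google.appengine.api import urlfetch
from google.appengine.ext import db

def fetch_as_text(url, default_charset="utf-8"):
    result = urlfetch.fetch(url)
    content_type = (result.headers.get("Content-Type")
                    or result.headers.get("content-type", ""))
    charset = default_charset
    if "charset=" in content_type:
        charset = content_type.split("charset=")[-1].strip()
    # db.Text is a unicode subclass, so decoding here is what makes
    # the page storable in a TextProperty
    return db.Text(result.content, charset)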



#!/usr/bin/env python
#coding=utf8
import logging

from google.appengine.api.labs import taskqueue
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.api import urlfetch
from google.appengine.api import memcache
class Counter(db.Model):
    # one fetched page: raw html stored as unicode Text, plus its url and isbn
    count = db.IntegerProperty(indexed=False)
    html = db.TextProperty()
    url = db.LinkProperty()
    isbn = db.StringProperty()
   

class CounterHandler(webapp.RequestHandler):
    def get(self):
        key = self.request.get('isbn')
        # Add the task to the default queue.
        taskqueue.add(url='/worker', params={'isbn': key},method='GET')
        self.response.out.write("done")
#        self.redirect('/')

class CounterWorker(webapp.RequestHandler):
    def get(self):  # should run at most 1/s (set by the queue's rate)
        logging.info("worker started")
        isbn = self.request.get("isbn").encode("utf8")

        rpc = urlfetch.create_rpc()
        header = {}  # optional browser-like headers; pass headers=header to make_fetch_call to use them
#        header["User-Agent"]="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6"
#        header["Accept"]="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
#        header["Accept-Language"]="en-us,en;q=0.5"
#        header["Accept-Charset"]="ISO-8859-1,utf-8;q=0.7,*;q=0.7"
#        urlfetch.make_fetch_call(rpc, "http://www.amazon.cn/",method="GET")
        urlfetch.make_fetch_call(rpc, "http://www.dangdang.com/",method="GET")
       
        # ... do other things ...
       
#        try:
        result = rpc.get_result()
        logging.info("fetch status: %d" % result.status_code)
        if result.status_code == 200:
            # convert the fetched bytes to unicode before storing them;
            # dangdang.com serves gb18030/gbk pages (use "utf-8" for utf-8 sites)
            ccc = Counter(html=db.Text(result.content, "gb18030"),
                          url="http://www.dangdang.com/",
                          isbn=isbn)
            ccc.put()
#        except urlfetch.DownloadError:
#            # Request timed out or failed.
#            pass
#        print isbn
       
#        def txn():
#            counter = Counter.get_by_key_name(key)
#            if counter is None:
#                counter = Counter(key_name=key, count=1)
#            else:
#                counter.count += 1
#            counter.put()
#        db.run_in_transaction(txn)
        logging.info("done")
   
def main():
    run_wsgi_app(webapp.WSGIApplication([
        ('/', CounterHandler),
        ('/worker', CounterWorker),
    ]))

if __name__ == '__main__':
    main()
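
The commented-out try/except in the worker points at something worth making active: urlfetch raises DownloadError when a request times out or fails, and an unhandled exception just makes the task error out and get retried. Below is a minimal sketch of the fetch-and-store step with that handling enabled; fetch_and_store is my own name, and it assumes the Counter model and imports from the listing above:

def fetch_and_store(url, isbn, charset="gb18030"):
    # synchronous fetch with an explicit deadline and error handling
    try:
        result = urlfetch.fetch(url, deadline=10)
    except urlfetch.DownloadError:
        # request timed out or failed; return None so the caller can decide
        # whether to give up or let the task queue retry
        return None
    if result.status_code != 200:
        return None
    page = Counter(html=db.Text(result.content, charset), url=url, isbn=isbn)
    page.put()
    return page

As for the "at most 1/s" note on the worker: that limit is normally enforced by giving the queue a rate of 1/s in queue.yaml rather than in the handler code itself.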
   
   

