爬取的数据来源
爬取部分我简单使用了 Python 的 Scrapy 框架。
main.py
import os
import platform
import tornado
from config.setting import app_port
from my_tornado.handler import _make_app
# Compatibility shim for the Windows platform.
def _on_window():
    """Switch asyncio to the selector event loop policy when running on Windows.

    Does nothing on any other platform.
    """
    if platform.system() != "Windows":
        return
    # Imported lazily so the module is only touched on the platform that needs it.
    import asyncio

    # NOTE(review): presumably needed because Tornado requires the selector
    # event loop on Windows - confirm against the Tornado version in use.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# Start the service.
def run():
    """Build the Tornado application, bind it to ``app_port`` and run the IOLoop.

    This call blocks until the IOLoop is stopped.
    """
    # Import the submodule explicitly: a bare ``import tornado`` does not by
    # itself guarantee that ``tornado.ioloop`` has been loaded.
    import tornado.ioloop

    _on_window()
    app = _make_app()
    app.listen(app_port)
    tornado.ioloop.IOLoop.current().start()
# NOTE(review): a bare ``os.path.join(os.path.dirname(__file__), "templates")``
# expression used to sit here; its result was discarded, so it had no effect.
# If a template directory was intended, it has to be passed to the application
# settings instead - confirm with the author.
if __name__ == "__main__":
    run()
# http://localhost:8883/random/news/?count=4
# http://localhost:8883/random/musics/?count=4
# http://localhost:8883/random/images/?count=4
setting.py
# MongoDB connection settings.
mongo_host, mongo_port = "127.0.0.1", 27017

# Port the HTTP application listens on.
app_port = 8883

# Music: database name and collection name.
music_db_name, music_db_collection = "douban", "music"

# News: database name and collection name.
news_db_name, news_db_collection = "yang_news", "home_news"

# Images: database name and collection name.
image_db_name, image_db_collection = "images", "music_image"
dbUtils.py
import random
import pymongo
# 数据总记录个数
from config.setting import mongo_host, mongo_port, music_db_name, music_db_collection, news_db_name, news_db_collection, \
image_db_name, image_db_collection
# Document count of the collection most recently opened by _get_db().
_db_data_size = 0

# Shared MongoClient, created lazily on the first _get_db() call.
_mongo_client = None


# Obtain a reference to a collection.
def _get_db(db_name, db_collection):
    """Return the collection ``db_collection`` of the database ``db_name``.

    Side effect: the module-level ``_db_data_size`` is refreshed with the
    collection's current document count; the ``random_*`` helpers read it to
    pick a random skip offset.
    """
    global _db_data_size, _mongo_client
    # Reuse one client instead of opening a new one on every call; the
    # original created a fresh MongoClient per request and never closed it.
    if _mongo_client is None:
        _mongo_client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
    table = _mongo_client[db_name][db_collection]
    # Cursor.count() is deprecated since PyMongo 3.7 and removed in 4.0;
    # count_documents({}) is its replacement.
    _db_data_size = table.count_documents({})
    return table
# Fetch music.
def random_musics(count):
    """Return ``count`` randomly picked documents from the music collection.

    Every pick is independent, so the same document can appear more than once.
    An empty list is returned when ``count`` is not positive or when the
    collection holds no documents.
    """
    if count <= 0:
        return []
    collection = _get_db(music_db_name, music_db_collection)
    if _db_data_size <= 0:
        # random.randint(0, -1) would raise ValueError on an empty collection.
        return []
    # One query per pick: skip a random number of documents and take the next.
    return [
        collection.find().limit(-1).skip(random.randint(0, _db_data_size - 1)).next()
        for _ in range(count)
    ]
# Fetch news.
def random_news(count):
    """Return ``count`` randomly picked documents from the news collection.

    Every pick is independent, so the same document can appear more than once.
    An empty list is returned when ``count`` is not positive or when the
    collection holds no documents.
    """
    if count <= 0:
        return []
    collection = _get_db(news_db_name, news_db_collection)
    if _db_data_size <= 0:
        # random.randint(0, -1) would raise ValueError on an empty collection.
        return []
    # One query per pick: skip a random number of documents and take the next.
    return [
        collection.find().limit(-1).skip(random.randint(0, _db_data_size - 1)).next()
        for _ in range(count)
    ]
# Fetch images.
def random_images(count):
    """Return the ``image_url`` of ``count`` randomly picked image documents.

    Every pick is independent, so the same URL can appear more than once.
    An empty list is returned when ``count`` is not positive or when the
    collection holds no documents.
    """
    if count <= 0:
        return []
    collection = _get_db(image_db_name, image_db_collection)
    if _db_data_size <= 0:
        # random.randint(0, -1) would raise ValueError on an empty collection.
        return []
    # One query per pick: skip a random number of documents and take the next.
    return [
        collection.find().limit(-1).skip(random.randint(0, _db_data_size - 1)).next()['image_url']
        for _ in range(count)
    ]
handle.py
import json
import os
import platform
import tornado.ioloop
import tornado.web
from pymongo import settings
from utils.dbUtils import random_musics, random_news, random_images
# Error page.
class MainHandler(tornado.web.RequestHandler):
    """Handler that renders the static ``index.html`` page."""

    @tornado.gen.coroutine
    def get(self):
        """Render ``../html/index.html`` as the response."""
        self.render('../html/index.html')
# Fetch music.
class RandomMusicHandler(tornado.web.RequestHandler):
    """Answer ``GET ...?count=N`` with N random music records encoded as JSON."""

    # Maximum number of records a single request may ask for.
    max_count = 100

    # Document fields copied into each entry of the response.
    _fields = ('music_url', 'music_title', 'music_size', 'music_redirect_url')

    @tornado.gen.coroutine
    def get(self):
        """Write ``{"size": n, "content": [...]}`` for the requested ``count``.

        A request above ``max_count`` gets ``index.html`` rendered instead.

        Raises:
            tornado.web.HTTPError: 400 when ``count`` is not an integer.
                A missing ``count`` is rejected by ``get_argument`` itself.
        """
        raw_count = self.get_argument('count')
        try:
            count = int(raw_count)
        except ValueError:
            # Previously this escaped as an unhandled ValueError (HTTP 500).
            raise tornado.web.HTTPError(400, "count must be an integer") from None

        if count > self.max_count:
            self.render('../html/index.html')
            return

        # Convert the documents into the JSON payload.
        musics = random_musics(count)
        content = [{key: doc[key] for key in self._fields} for doc in musics]
        self.write(json.dumps({'size': len(musics), 'content': content}))
# Fetch news.
class RandomNewsHandler(tornado.web.RequestHandler):
    """Answer ``GET ...?count=N`` with N random news records encoded as JSON."""

    # Maximum number of records a single request may ask for.
    max_count = 100

    # Document fields copied into each entry of the response.
    _fields = ('title', 'url', 'image', 'image2', 'keywords', 'brief', 'focus_date')

    @tornado.gen.coroutine
    def get(self):
        """Write ``{"total": n, "data": [...]}`` for the requested ``count``.

        A request above ``max_count`` gets ``index.html`` rendered instead.

        Raises:
            tornado.web.HTTPError: 400 when ``count`` is not an integer.
                A missing ``count`` is rejected by ``get_argument`` itself.
        """
        raw_count = self.get_argument('count')
        try:
            count = int(raw_count)
        except ValueError:
            # Previously this escaped as an unhandled ValueError (HTTP 500).
            raise tornado.web.HTTPError(400, "count must be an integer") from None

        if count > self.max_count:
            self.render('../html/index.html')
            return

        # Convert the documents into the JSON payload.
        news = random_news(count)
        data = [{key: doc[key] for key in self._fields} for doc in news]
        self.write(json.dumps({'total': len(news), 'data': data}))
# Fetch images.
class RandomImageHandler(tornado.web.RequestHandler):
    """Answer ``GET ...?count=N`` with N random image URLs encoded as JSON."""

    # Maximum number of URLs a single request may ask for.
    max_count = 100

    @tornado.gen.coroutine
    def get(self):
        """Write ``{"total": n, "data": [url, ...]}`` for the requested ``count``.

        A request above ``max_count`` gets ``index.html`` rendered instead.

        Raises:
            tornado.web.HTTPError: 400 when ``count`` is not an integer.
                A missing ``count`` is rejected by ``get_argument`` itself.
        """
        # get_argument() without a default never returns None (it raises when
        # the argument is missing), so the former ``arg is None`` check was
        # unreachable and has been dropped.
        raw_count = self.get_argument('count')
        try:
            count = int(raw_count)
        except ValueError:
            # Previously this escaped as an unhandled ValueError (HTTP 500).
            raise tornado.web.HTTPError(400, "count must be an integer") from None

        if count > self.max_count:
            self.render('../html/index.html')
            return

        urls = random_images(count)
        self.write(json.dumps({'total': len(urls), 'data': urls}))

    def write_error(self, state_code, **kwargs):
        """Write the error payload built by ``get_error_msg`` for ``state_code``."""
        self.write(get_error_msg(False, '请求失败', state_code))
# Keyword settings passed to tornado.web.Application by _make_app().
# NOTE: this rebinds the name ``settings`` imported from pymongo above.
settings = dict(
    static_path=os.path.join(os.path.dirname(__file__), "static"),
)
# Error response payload.
def get_error_msg(right, instructions, state_code):
    """Build the error payload returned to the client.

    Args:
        right: Stored under the ``ok`` key.
        instructions: Stored under the ``instructions`` key.
        state_code: Stored under the ``state_code`` key.

    Returns:
        A new dict holding the three values under the keys listed above.
    """
    return {
        'ok': right,
        'instructions': instructions,
        'state_code': state_code,
    }
# Handler (route) setup.
def _make_app():
    """Create the Tornado application from the route table and ``settings``."""
    routes = [
        (r"/", MainHandler),
        (r"/random/musics/", RandomMusicHandler),
        (r"/random/news/", RandomNewsHandler),
        (r"/random/images/", RandomImageHandler),
    ]
    return tornado.web.Application(routes, **settings)
首先,这个 RESTful 接口还有很多不足,特别是安全方面,今后会逐渐完善,使其更加健壮。