# -*- coding: utf-8 -*-
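# Multithreaded crawler: fetch JSON metadata for pages under
# https://domain.com/video/<id> and store each record in MySQL.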
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
import sys
import re
import json
import MySQLdb
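# Python 2 hack: default all str/unicode conversions to UTF-8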
reload(sys)
sys.setdefaultencoding('utf-8')
urls = []
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
}
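# Record the start time so the total crawl duration can be reported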
time1 = time.time()
# Build the list of candidate video URLs (IDs 1 through 9999)
for i in range(1, 10000):
    url = 'https://domain.com/video/' + str(i)
    urls.append(url)
def spider(url):
    # Each URL returns a JSON document describing one video;
    # fetch it once (the original fetched every page twice)
    jsoncontent = requests.get(url, headers=head, timeout=10).content
    jsDict = json.loads(jsoncontent)
    if jsDict['code'] == 0:
        jsData = jsDict['data']
        vtitle = jsData['vtitle']
        play_count = jsData['play_count']
        date_add = jsData['date_add']
        try:
            conn = MySQLdb.connect(host='localhost', user='', passwd='',
                                   db='db', port=3306, charset='utf8')
            cur = conn.cursor()
            # Parameterized INSERT; store the raw record as JSON text
            cur.execute(
                'INSERT INTO videos (url, json, vtitle, play_count, datetime) '
                'VALUES (%s, %s, %s, %s, %s)',
                [url, json.dumps(jsData), vtitle, play_count, date_add])
            conn.commit()
            cur.close()
            conn.close()
            print "Succeed: " + url
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    else:
        print "Error_Json: " + url
# Crawl all URLs with a pool of 10 worker threads
pool = ThreadPool(10)
try:
    results = pool.map(spider, urls)
except Exception, e:
    # On a transient failure (e.g. a dropped connection),
    # wait five minutes and retry the whole batch once
    print e
    time.sleep(300)
    results = pool.map(spider, urls)
pool.close()
pool.join()
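# Total elapsed time since the start timestamp taken above
print "Total time: %.2f s" % (time.time() - time1)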