#!/usr/local/bin/python#-*- coding: utf-8 -*-#Python: 3.5#Author: zhenghai.zhang@xxx.com#Program: 爬取豆瓣网站上所有电视剧的名称并写入数据库。#Version: 0.1#History: 2017.11.01
importrequests,time, pymysql, re, datetime, jsonfrom exchangelib importDELEGATE, Account, Credentials, Message, Mailbox, HTMLBody
host= ‘xxx‘user= ‘xxx‘passwd= ‘xxx‘dbme= ‘crawl‘dbtarget= ‘back_brace‘table= ‘tv_hotwords‘tabledelta= ‘tv_hotwords_delta‘tablesync= ‘slot_value‘port= 3306tolist= [‘zhenghai.zhang@xxx.com‘]
defget_tvs(urlbase, page):try:
url= urlbase +str(page)print(url)try:
result=requests.get(url).text
jresult=json.loads(result)
tvs= jresult.get(‘subjects‘)except:print(‘爬取‘ + urlbase + str(page) + ‘失败!‘)
time.sleep(2)returntvsexcept:print(‘获取第%s页电影列表失败‘ %page)deftv_insert(host, user, passwd, dbme, port, table, tvs_list):
conn= pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
cur=conn.cursor()
new_tvs=[]
punc= "!??"#$%&'()*+,-/:;<=>@[\]^_`{|}~?????、〃》「」『』【】〔〕〖〗?????〝〞????–—‘’?“”??…?﹏.()::。·"punctuation=puncfor tv intvs_list:try:
tv[‘title‘] = re.sub(r"[%s]+" % punctuation, "", tv.get(‘title‘))
cmd= ‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ %(
table, tv.get(‘id‘), tv.get(‘title‘))
cur.execute(cmd)
new_tvs.append(tv)except:print(" " * 20, tv.get(‘title‘), "already exists, skip……")
cur.close()
conn.commit()
conn.close()returnnew_tvsdeftv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, tvs_list, tablesync):
conn= pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
cur=conn.cursor()
cur.execute("delete from %s" % dbme+"."+tabledelta)for tv intvs_list:try:
cmd= ‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ % (tabledelta, tv[‘id‘], tv[‘title‘])
cmdsync= ‘insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")‘ % (dbtarget+"."+tablesync, "xxxxxx", tv[‘title‘], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "")
cur.execute(cmd)
cur.execute(cmdsync)exceptpymysql.Error:print(" " * 20, tv[‘title‘], "already exists, skip……")
cur.close()
conn.commit()
conn.close()deftv_new_to_release(host, user, passwd, dbtarget, port):
conn= pymysql.connect(host=host, user=user, passwd=passwd, db=dbtarget, port=port, charset="utf8")
cur=conn.cursor()try:
cmdbacktoskill= ‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
cmdskilltoskillpro= ‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))print(cmdbacktoskill)
cur.execute(cmdbacktoskill)print(cmdskilltoskillpro)
cur.execute(cmdskilltoskillpro)exceptpymysql.Error:print("write into back_brace.release_task error!!!")
cur.close()
conn.commit()
conn.close()defEmail(to, subject, body):
creds=Credentials(
username=‘xxxxxx‘,
password=‘xxxxxx‘)
account=Account(
primary_smtp_address=‘xxx@xxx.com‘,
credentials=creds,
autodiscover=True,
access_type=DELEGATE)
m=Message(
account=account,
subject=subject,
body=HTMLBody(body),
to_recipients=[Mailbox(email_address=to)])
m.send_and_save()if __name__ == ‘__main__‘:
update_tvs=[]
pages= 25 #豆瓣每项电视剧只有前500部
#美剧 英剧 韩剧 日剧 国产剧 港剧 日本动画 综艺
urlbaselist = [‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E9%9F%A9%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%B8%AF%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E6%9C%AC%E5%8A%A8%E7%94%BB&sort=recommend&page_limit=20&page_start=‘,‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BB%BC%E8%89%BA&sort=recommend&page_limit=20&page_start=‘]for urlbase inurlbaselist:for i inrange(pages):print("*"*30, i, "*"*30)
tvs_list= get_tvs(urlbase, i * 20)
new_tvs=tv_insert(host, user, passwd, dbme, port, table, tvs_list)for tv innew_tvs:print(tv[‘title‘],"Added")
onetv={}
onetv["id"] = tv["id"]
onetv["title"] = tv["title"]
update_tvs.append(onetv)
time.sleep(1)print(update_tvs)try:
tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_tvs, tablesync)#将增加的电影写入movie_hotwords_delta表中
except:print("tv update and sync Error!")try:
tv_new_to_release(host, user, passwd, dbtarget, port)except:print("tv_new_to_release error!!!")
subject= ‘本次新增电视剧名称‘body= "本次新增的电影名称为:
"
for movie inupdate_tvs:
body+= movie[‘title‘] + "
"
for to intolist:
Email(to, subject, body)