# Requirements: Python, easy-install, pip, wxpy
# Python is the language runtime. easy-install is a Python package-management
# tool and a prerequisite for installing pip. Use pip to install wxpy, the
# library that controls WeChat from Python; find wxpy on GitHub.
# -*- coding: UTF-8 -*-
#author: 不饮者
#function: crawl SJTU scholarship news. Inform you when news are updated.
#details:crawl every 5 minutes, maximum runtime is 5 hours.
#!!!WARNING!!!WARNING!!!WARNING!!!WARNING!!!WARNING!!!WARNING!!!
#This program uses Web Wechat. Thus your phone Wechat's receiving function is stopped!
from wxpy import *
import urllib
import urllib2
import re
import time
import sys
#prepare for Chinese character reading & writing
reload(sys)
sys.setdefaultencoding('utf-8')
try:
# connect Web WeChat
bot = Bot()
bot.file_helper.send('Hello.')
bot.file_helper.send('Wechat robot, me.')
bot.file_helper.send('Informed when sholarship news updated, you.')
bot.file_helper.send('Scholarship Watcher, started.')
#initialization
currentNews = []
currentHref = []
newsCnt = 0
loopCnt = 1
url = r"http://xsb.seiee.sjtu.edu.cn/xsb/list/611-1-20.htm"#SJTU scholarship webpage
for i in range(60):#maximum runtime: 5 hours
# get current news and href
request = urllib2.Request(url)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
text = 'style="overflow-x:hidden;" href="(.*?)" title="(.*?)" target'
pattern = re.compile(text, re.S)
items = re.findall(pattern, content)
for item in items:
if newsCnt == 20:
break
href = 'http://xsb.seiee.sjtu.edu.cn' + item[0]
news = item[1]
news = news.decode('utf-8')
currentNews.append(news)
currentHref.append(href)
newsCnt += 1
# get old news and href
newsList = open(r'...\newsList.txt', 'r+')#news information file address
oldText = newsList.read()
oldText = oldText.decode("utf-8")
oldText = oldText.split('\n')
# find difference
if len(oldText) == 40:
oldFirstNews = oldText[0]
for i in range(20):
if (currentNews[i] == oldFirstNews) or (currentNews[i] == oldFirstNews[1:]):
# SOMETIMES txt file reading will add an empty character
break
else:
bot.file_helper.send('Update, Scholarship News!')
bot.file_helper.send(currentNews[i] + ' ' + currentHref[i])
else:#exception handling
bot.file_helper.send('Original txt file, not correctly prepared.')
bot.file_helper.send('Txt file, rebuilt.')
bot.file_helper.send('First three news, now:')
fhText = u''
for i in range(3):
fhText += (currentNews[i]+' '+currentHref[i]+'\n')
bot.file_helper.send(fhText)
# update information
newsList.seek(0)
newsList.truncate()
for i in range(19):
newsList.write(currentNews[i])
newsList.write('\n')
newsList.write(currentHref[i])
newsList.write('\n')
newsList.write(currentNews[19])
newsList.write('\n')
newsList.write(currentHref[19])
newsList.close()
#set interval behaviours
print 'Target webpage has been crawled for',loopCnt,'times'
loopCnt += 1
time.sleep(300)
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
if hasattr(e,"reason"):
print e.reason