# coding:utf-8
from login163 import *
from xml.parsers import expat
import MySQLdb
class mail163(Login163):
    """163 webmail helper: lists unread mail as XML and tidies that XML.

    Reads ``self.sid`` and ``self.headers``; neither is set in this class, so
    they are expected to be provided by ``Login163`` (defined in ``login163``).
    """

    def get_unread_mail(self, start, limit):
        """Request up to ``limit`` unread mails, beginning at offset ``start``.

        The query is POSTed as an XML document in the ``var`` form field.

        Returns the raw response body when it contains the text ``subject``,
        otherwise ``None`` (callers use ``None`` as the "no more mail" signal).
        """
        query = (
            '<?xml version="1.0"?><object>'
            '<int name="fid">1</int>'
            '<boolean name="skipLockedFolders">false</boolean>'
            '<string name="order">date</string>'
            '<boolean name="desc">true</boolean>'
            '<int name="start">' + str(start) + '</int>'
            '<int name="limit">' + str(limit) + '</int>'
            '<boolean name="topFirst">false</boolean>'
            '<object name="filterFlags"><boolean name="read">false</boolean></object>'
            '<boolean name="returnTotal">true</boolean>'
            '<boolean name="returnTag">true</boolean>'
            '</object>'
        )
        postdata = urllib.urlencode({'var': query})
        url = 'http://twebmail.mail.163.com/js5/s?sid='+self.sid+'&func=mbox:listMessages&deftabclick=t2&deftabclick=undefined&from=toolbar&type=unread&mboxentry=1'
        req = urllib2.Request(url=url, data=postdata, headers=self.headers)
        response = urllib2.urlopen(req)
        try:
            res = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
        return res if 'subject' in res else None

    def format(self, xml_data):
        """Strip noise from the listing XML and return the cleaned string.

        * removes every ``<object name="ctrls">`` / ``<object name="flags">``
          sub-object (including the self-closing ``flags`` form);
        * rewrites each ``from`` / ``to`` string element so that it holds only
          the captured ``...@...`` part found between a ``;`` and a following
          ``&`` (presumably the address inside ``&lt;...&gt;`` -- confirm
          against a real response).
        """
        noise = re.compile(r'<object name="ctrls">.*?</object>|<object name="flags" />|<object name="flags">.*?</object>', re.S)
        xml_data = noise.sub('', xml_data)
        # Same rewrite for both address fields; only the element name differs.
        for field in ('from', 'to'):
            address = re.compile(r'<string name="%s">.*?;(.*?@.*?)&.*?</string>' % field)
            xml_data = address.sub(r'<string name="%s">\1</string>' % field, xml_data)
        return xml_data
# db connect
class Db_Connect(object):
    """Small wrapper around a single MySQLdb connection."""

    def __init__(self, db_host, user, pwd, db_name, charset="utf8", use_unicode=True):
        """Open the connection.

        On ``MySQLdb.OperationalError`` the failure is printed and the process
        exits with status 1.
        """
        try:
            self.conn = MySQLdb.Connection(db_host, user, pwd, db_name, charset=charset, use_unicode=use_unicode)
        except MySQLdb.OperationalError as e:
            print('Connect %s Failed' % db_host)
            print(e.args)
            sys.exit(1)

    def insert(self, sql, params=None):
        """Execute one statement, commit it, and return ``execute``'s result.

        ``params`` is optional and is handed to ``cursor.execute`` so the
        driver does the quoting; with the default ``None`` the statement is
        executed exactly as given, as before.

        ``MySQLdb.Warning`` and ``MySQLdb.IntegrityError`` are printed and
        swallowed; in that case the method returns ``None``.
        """
        cursor = self.conn.cursor()
        try:
            n = cursor.execute(sql, params)
            # MySQLdb does not autocommit; without this the row is dropped
            # when the connection closes (for transactional tables).
            self.conn.commit()
            return n
        except (MySQLdb.Warning, MySQLdb.IntegrityError) as e:
            print(e.args)
        finally:
            cursor.close()

    def close(self):
        """Close the underlying connection."""
        self.conn.close()


class Mail_Handler(object):
    """Expat callbacks that turn every ``<object>`` element into a ``mails`` row.

    Child elements are keyed by their ``name`` attribute, e.g.
    ``<string name="id">abc</string>`` stores ``mail['id'] = 'abc'``.
    ``db_conn`` must offer ``insert(sql, params)``.
    """

    # Element names collected per mail, in INSERT column order.
    _FIELDS = ('id', 'from', 'to', 'subject', 'size')
    # Driver-side placeholders: values are never spliced into the SQL text.
    _INSERT_SQL = ("insert into mails(id, from_mail, to_mail, subject, size) "
                   "values(%s, %s, %s, %s, %s)")

    def __init__(self, data, db_conn):
        self.flag = False  # True while inside an element (text is collected)
        self.mail = {}  # fields of the mail currently being read
        self.curr_attrib = ''  # key under which incoming text is stored
        self.data = data  # xml data
        self.db_conn = db_conn
        self._text = ''  # text gathered so far for the current element

    def start(self, name, attributes):
        """Start-tag callback: reset on ``<object>``, pick the current key."""
        if name == 'object':
            self.mail = {}
        field = attributes.get('name')
        if field is not None:
            self.curr_attrib = field
        self._text = ''
        self.flag = True

    def end(self, name):
        """End-tag callback: on ``</object>`` insert the collected mail.

        Raises ``KeyError`` if one of the expected fields was never seen and
        ``ValueError`` if ``size`` is not an integer string.
        """
        if name == 'object':
            values = [self.mail[key] for key in self._FIELDS]
            values[-1] = int(values[-1])  # the size type is int
            self.db_conn.insert(self._INSERT_SQL, tuple(values))
        self.flag = False

    def character(self, data):
        """Text callback: append ``data`` to the current field.

        Expat may report one text node in several pieces (for instance around
        an entity such as ``&amp;``), so the pieces are joined rather than
        letting the last one overwrite the rest.
        """
        if self.flag:
            self._text += data
            self.mail[self.curr_attrib] = self._text

    def parser(self):
        """Run an expat parser over ``self.data`` with this object's callbacks."""
        p = expat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.character
        p.Parse(self.data)  # parse xml data
def main():
    """Log in to 163 mail and copy unread-mail listings into MySQL.

    Connects to the database, prompts for the mailbox credentials, then pages
    through the unread list ``limit`` mails at a time until
    ``get_unread_mail`` returns ``None``. Each page is cleaned with
    ``format`` and passed to ``Mail_Handler`` for parsing and insertion.
    Nothing is fetched when ``login()`` returns a falsy sid.
    """
    db_conn = Db_Connect('192.168.110.142', 'admin', 'admin', 'test')
    try:
        username = raw_input('Enter you email:')
        password = getpass.getpass('Enter you password:')
        login = mail163(username, password)
        sid = login.login()  # login the 163 mail for getting sid
        if not sid:
            return
        start = 0  # offset of the first mail on the current page
        limit = 5  # read 5 unread mails at once
        while True:
            res = login.get_unread_mail(start, limit)
            if res is None:
                break
            res = login.format(res)  # use re module format data
            Mail_Handler(res, db_conn).parser()  # use expat parse xml
            start += limit
    finally:
        # Always release the connection, including on errors and early return.
        db_conn.close()


if __name__ == '__main__':
    main()
这段代码读取 163 邮箱中未读邮件的标题,并将数据插入 MySQL 数据库,其中使用 expat 解析 XML 数据。Login163 类是"爬虫"一节中定义的类。
转载于:https://blog.51cto.com/liuping0906/1404437