# -*- coding: gb2312 -*-
#from HTMLParser import HTMLParser
from HTMLParser import *
#import HTMLParser
import urllib
import sys
import time
import MySQLdb
#html_parser = HTMLParser.HTMLParser()
def db_insert_77169(id,sub_id,data,link):
    """Insert one row into the `77169_download_info` table.

    A new MySQL connection is opened for every call. The row is written with
    a parameterized statement, committed, and the cursor and connection are
    then closed -- also when the insert or the commit fails.

    id     -- value for the first column.
    sub_id -- value for the second column; passed through str() before binding.
    data   -- value for the third column.
    link   -- value for the fourth column.

    Errors raised by MySQLdb are not handled here; they propagate to the
    caller after the cursor and connection have been released.
    """
    # NOTE(review): connection settings (including the password) are hard-coded.
    conn = MySQLdb.connect(host='localhost',user='root',passwd='111111',db='sample',port=3306,charset='gb2312')
    try:
        cur = conn.cursor()
        try:
            values=[id,str(sub_id),data,link]
            cur.execute('insert into 77169_download_info values(%s,%s,%s,%s)',values)
            conn.commit()
        finally:
            # Runs on failure too, so a rejected row no longer leaks the cursor.
            cur.close()
    finally:
        conn.close()
def db_insert_77169_temp(id,sub_id):
    """Insert one (id, sub_id) row into the `77169_download_info_temp` table.

    A new MySQL connection is opened for every call. The row is written with
    a parameterized statement, committed, and the cursor and connection are
    then closed -- also when the insert or the commit fails.

    id     -- value for the first column.
    sub_id -- value for the second column; passed through str() before binding.

    Errors raised by MySQLdb are not handled here; they propagate to the
    caller after the cursor and connection have been released.
    """
    # NOTE(review): connection settings (including the password) are hard-coded.
    conn = MySQLdb.connect(host='localhost',user='root',passwd='111111',db='sample',port=3306,charset='gb2312')
    try:
        cur = conn.cursor()
        try:
            cur.execute('insert into 77169_download_info_temp values(%s,%s)',[id,str(sub_id)])
            conn.commit()
        finally:
            # Runs on failure too, so a rejected row no longer leaks the cursor.
            cur.close()
    finally:
        conn.close()
class parselinks(HTMLParser):
    """HTML parser that collects the text and href of selected <a> elements.

    Only <a> start tags carrying exactly two attributes, one of which is
    `href`, are tracked. For each tracked anchor whose text is non-empty once
    all whitespace has been removed, the text is appended to `self.data` and
    the href value to `self.link`, so the two lists stay index-aligned.

    Attributes:
      data          -- collected anchor texts (whitespace removed).
      link          -- collected href values, parallel to `data`.
      href          -- 1 while inside a tracked anchor, otherwise 0.
      linkname      -- text accumulated for the anchor currently being read.
      linkname_temp -- href value of the anchor currently being read.
    """
    def __init__(self):
        self.data=[]
        self.link=[]
        self.href=0
        self.linkname=''
        self.linkname_temp=''
        # Explicit base-class call, as in the original code.
        HTMLParser.__init__(self)

    def handle_starttag(self,tag,attrs):
        """Start tracking an <a> tag that has exactly two attributes incl. href."""
        # NOTE(review): the "exactly two attributes" filter is kept as-is; it
        # presumably matches the markup of the target pages -- confirm.
        if tag != 'a' or len(attrs) != 2:
            return
        for name,value in attrs:
            if name == 'href':
                self.linkname_temp = value
                self.href=1

    def handle_data(self,data):
        """Accumulate text while inside a tracked anchor."""
        if self.href:
            self.linkname+=data

    def handle_endtag(self,tag):
        """On </a>, store the collected (text, href) pair and reset state."""
        if tag != 'a':
            return
        # split() + join() removes every whitespace run, leading and trailing
        # included, so no separate strip() is required afterwards.
        text=''.join(self.linkname.split())
        if text:
            self.data.append(text)
            self.link.append(self.linkname_temp)
        # Always reset the per-anchor state so nothing carries over to the
        # next anchor, even when this one had no usable text.
        self.linkname_temp = ''
        self.linkname=''
        self.href=0

    def getresult(self):
        """Write every collected (text, href) pair to the database.

        Each pair is inserted with db_insert_77169(); if that raises, the
        (page id, index) pair is recorded with db_insert_77169_temp() instead.
        Nothing is written when `data` and `link` differ in length.

        The page id is read from the module-level name `total_id`, which the
        script's main loop sets; that name must exist before any row is
        written.
        """
        if len(self.data) != len(self.link):
            return
        for sub_id,(text,href) in enumerate(zip(self.data,self.link)):
            try:
                db_insert_77169(total_id,sub_id,text,href)
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit still stop the crawl instead of being redirected
                # into the fallback insert.
                db_insert_77169_temp(total_id,sub_id)
if __name__=="__main__":
    # Fetch pages 0..69999 one by one, extract their anchors with parselinks
    # and store them through parselinks.getresult().
    #
    # NOTE: the loop variable `total_id` must stay a module-level name --
    # parselinks.getresult() reads it as a global when writing rows, so this
    # loop must not be moved into a function.
    for total_id in range(0,70000):
        url_link = "http://soft.aaaaa.com/HTML/" + str(total_id) + ".html"
        try:
            IParser = parselinks()
            response = urllib.urlopen(url_link)
            try:
                data_dl = response.read()
            finally:
                # Release the HTTP connection even if the read fails.
                response.close()
            IParser.feed(data_dl)
            IParser.getresult()
            IParser.close()
            # Progress marker; the parenthesised single-argument form prints
            # the same text as the former `print total_id` statement.
            print(total_id)
        except Exception as exc:
            # Best-effort crawl: a failing page is skipped, but the reason is
            # reported, and Ctrl-C / SystemExit are no longer swallowed.
            sys.stderr.write("skipped %s: %r\n" % (url_link, exc))
            continue