#coding=utf-8
import re
import urllib.request
import xlrd
import xlwt
def gethtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is opened with a 10-second timeout. Any error raised while
    opening the URL or reading the response propagates to the caller.
    """
    # The context manager closes the response even if read() raises,
    # so connections are not leaked when many URLs are fetched in a row.
    with urllib.request.urlopen(url, timeout=10) as page:
        return page.read()
# Landline pattern: an area code starting with 0 (3-4 digits in total),
# a hyphen, then a 7- or 8-digit subscriber number.
_LANDLINE_RE = re.compile(r'0\d{2,3}-\d{7,8}')


def gettel(html):
    """Return the distinct landline numbers found in *html*.

    *html* is the raw page body as bytes. Each number is reported once, in
    the order of its first appearance in the page.
    """
    # 'ignore' drops bytes that are not valid UTF-8 instead of raising, so
    # pages in other encodings can still be scanned for the ASCII digits.
    text = html.decode('utf-8', 'ignore')
    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the result deterministic (a plain set would not be).
    return list(dict.fromkeys(_LANDLINE_RE.findall(text)))
# Input: the addresses to scan are read from the first column of the first
# sheet; each cell value is passed to gethtml() unchanged.
# NOTE(review): xlrd 2.0+ only reads legacy .xls files, so opening this .xlsx
# presumably relies on xlrd < 2.0 -- confirm the installed version.
readbook = xlrd.open_workbook('srcurl.xlsx')
sheet = readbook.sheet_by_index(0)
nrows = sheet.nrows

# Output: one sheet, column 0 = address, column 1 = space-separated numbers.
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My Worksheet')

for rowtag in range(nrows):
    domainname = sheet.cell(rowtag, 0).value
    try:
        html = gethtml(domainname)
        tel = " ".join(gettel(html))
        worksheet.write(rowtag, 0, domainname)
        worksheet.write(rowtag, 1, tel)
        # Save once per successful row, so results collected so far are
        # already on disk if a later row aborts the run.
        workbook.save('selecttel.xls')
        print(domainname)
        print(tel)
    except urllib.error.URLError:
        # HTTPError is a subclass of URLError, so this one handler covers
        # both HTTP status errors and connection-level failures.
        print("页面无法访问")
    except Exception as e:
        # Best effort: report the problem and move on to the next row.
        print("出现异常:" + str(e))