本例中,多个网站的网址保存在“D:\test.xlsx”文件的第一列;代码运行结束后,站内链接导出到第二列,链接名保存在第三列。以下代码已在 Python 2.7 中调试通过:
#coding=utf-8
import requests
import os
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl import load_workbook
from requests import exceptions
def get_html(url, timeout=30):
    """Download a page and return its HTML as text.

    Args:
        url: Address of the page to request.
        timeout: Timeout in seconds handed to ``requests.get``. Defaults to
            30, the value that was previously hard-coded.

    Returns:
        The decoded response body when the server answers with status 200.
        An empty string when the request fails (the exception is printed)
        or when the final status code is anything other than 200.
    """
    # Identify as a desktop browser (IE 11 on Windows 10). The string is
    # assembled from adjacent literals so that no separator is lost at the
    # line break.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; '
                       'rv:11.0) like Gecko'),
    }
    try:
        r = requests.get(url, timeout=timeout, headers=headers)
        # Turns 4xx/5xx answers into an exception handled below.
        r.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return ''
    if r.status_code != 200:
        return ''
    # Decode with the encoding detected from the body itself; the encoding
    # announced in the response headers is not relied upon.
    r.encoding = r.apparent_encoding
    return r.text
def rexcel(excelFile):
    """Read column 1 of the active worksheet of an Excel workbook.

    Args:
        excelFile: Path of the workbook to read.

    Returns:
        A list holding the column-1 value of every row from row 1 through
        the worksheet's ``max_row``. Values that support ``strip()`` (text)
        are returned with leading and trailing whitespace removed; all
        other values are returned unchanged. An empty list is returned,
        after printing a message, when the file does not exist.
    """
    if not os.path.exists(excelFile):
        print("文件不存在")
        return []
    ws = load_workbook(excelFile).active
    values = []
    for r in range(1, ws.max_row + 1):
        value = ws.cell(row=r, column=1).value
        # Whitespace pasted along with a URL is invisible in the sheet but
        # ends up in the request and in every link built from the address.
        if hasattr(value, 'strip'):
            value = value.strip()
        values.append(value)
    return values
def wexcel(excelFile, hr1, st1):
    """Write links and their names into columns 2 and 3 of a workbook.

    Entry ``k`` of ``hr1`` goes to column 2 and entry ``k`` of ``st1`` to
    column 3 of row ``k + 1`` of the active worksheet; the workbook is then
    saved back to the same path.

    Args:
        excelFile: Path of an existing workbook to update.
        hr1: Sequence of link URLs.
        st1: Sequence of link names, index-aligned with ``hr1``.

    Returns:
        1 after the workbook has been saved, 0 (after printing a message)
        when the file does not exist.

    Raises:
        IndexError: If ``st1`` is shorter than the part of ``hr1`` written.
    """
    # An .xlsx worksheet holds at most 1,048,576 rows. The cap is applied
    # before writing so that no cell beyond the last valid row is touched.
    max_rows = 1048576
    if not os.path.exists(excelFile):
        print("文件不存在")
        return 0
    wb = load_workbook(excelFile)
    ws = wb.active
    for r, link in enumerate(hr1[:max_rows], 1):
        ws.cell(row=r, column=2).value = link
        ws.cell(row=r, column=3).value = st1[r - 1]
    wb.save(excelFile)
    return 1
def _is_site_relative(href):
    """Return True when *href* is treated as a link inside the current site.

    Absolute links (starting with ``http``) are rejected, as are mail
    addresses, script pseudo-links (including the misspelled
    ``javasrcipt:`` variant) and telephone links.
    """
    if href[0:4] == 'http':
        return False
    excluded = ('@', 'javascript:', 'javasrcipt:', 'tel:')
    return not any(marker in href for marker in excluded)


def main():
    """Collect the in-site links of every site listed in the workbook.

    Site addresses are read from column 1 of the workbook; reading stops at
    the first empty cell and entries not starting with ``http`` are skipped.
    For each remaining site the home page is fetched and every relative
    ``<a href>`` is turned into ``site + '/' + href``. Each distinct link is
    recorded once together with the anchor's text, and the collected rows
    are finally written to columns 2 and 3 of the same workbook.
    """
    excelFile = 'D:/test.xlsx'
    hr1 = []        # links, written to column 2
    st1 = []        # link names / status notes, written to column 3
    seen = set()    # everything already in hr1, for O(1) duplicate checks

    def add_row(link, name):
        hr1.append(link)
        st1.append(name)
        seen.add(link)

    for site in rexcel(excelFile):
        if site is None:
            # The first empty cell marks the end of the address list.
            break
        print(site)
        if site[0:4] != 'http':
            continue
        demo = get_html(site)
        if demo == '':
            add_row(site, u'获取主页数据失败')
            continue
        soup = BeautifulSoup(demo, "html.parser")
        # Drop one trailing slash so the join below never yields '//'.
        if site[-1] == '/':
            site = site[0:-1]
        anchors = soup.find_all('a')
        for a in anchors:
            if not a.has_attr('href'):
                continue
            urlg = a['href']
            if not urlg or not _is_site_relative(urlg):
                continue
            if urlg[0] != '/':
                urlg = '/' + urlg
            urlhb = site + urlg
            if urlhb in seen:
                # Skip links that were already recorded.
                continue
            add_row(urlhb, a.string)
        if not anchors:
            # The page contains no <a> element at all.
            add_row(site, u'该网页无链接')
    wexcel(excelFile, hr1, st1)


if __name__ == '__main__':
    main()
运行后的“D:\test.xlsx”文件如下图:
补充说明:爬取网页时如果报错并提示状态码 404,往往是 Excel 表中网址后面带有不可见字符造成的。