python实现多个网站站内链接批量导出

最新推荐文章于 2022-11-22 11:31:57 发布

gig919

最新推荐文章于 2022-11-22 11:31:57 发布

阅读量811

点赞数

本文链接：https://blog.csdn.net/gig919/article/details/104179394

版权

本例中，多个网站的网址保存在“D:\test.xlsx”文件的第一列；代码运行结束后，站内链接导出到第二列，链接名保存在第三列。以下代码已在 Python 2.7 中调试通过：

#coding=utf-8
import requests
import os

from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl import load_workbook
from requests import exceptions

def get_html(url):
    """Download ``url`` and return the page body as text.

    The request is sent with a browser-like User-Agent header and a
    30 second timeout. Surrounding whitespace is removed from ``url``
    before the request, because URLs copied into a spreadsheet cell often
    carry trailing blanks that make the request fail.

    Returns the response text (decoded with the encoding detected from the
    content) when the status code is 200. Returns an empty string when the
    request raises ``requests.RequestException`` (which includes the HTTP
    error raised by ``raise_for_status``) or when the status code is
    anything other than 200.
    """
    # Pretend to be a desktop browser. The value is built from adjacent
    # string literals so that no source indentation leaks into the header.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; '
                       'rv:11.0) like Gecko'),
    }
    timeout_seconds = 30

    # Only text-like values can be stripped; anything else is passed through
    # unchanged so that requests reports it as a request error.
    if hasattr(url, 'strip'):
        url = url.strip()

    try:
        r = requests.get(url, timeout=timeout_seconds, headers=headers)
        r.raise_for_status()
        # Use the encoding guessed from the body rather than the one
        # declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        if r.status_code == 200:
            return r.text
        return ''
    except requests.RequestException as e:
        print(e)
        return ''

def rexcel(excelFile):  # read the Excel file
    """Return the first-column cell values of the workbook's active sheet.

    One entry is produced per row, from row 1 up to the sheet's
    ``max_row``, in row order. If ``excelFile`` does not exist, a message
    is printed and an empty list is returned.
    """
    if not os.path.exists(excelFile):
        print("文件不存在")
        return []

    sheet = load_workbook(excelFile).active
    last_row = sheet.max_row
    return [
        sheet.cell(row=row_number, column=1).value
        for row_number in range(1, last_row + 1)
    ]

def wexcel(excelFile, hr1, st1):  # write the Excel file
    """Write ``hr1`` and ``st1`` into columns 2 and 3 of the active sheet.

    Entry ``k`` of each list goes to sheet row ``k + 1``. The workbook at
    ``excelFile`` is loaded, updated and saved back to the same path.

    At most 1048576 rows are written, which is the row limit of an .xlsx
    worksheet; extra entries are ignored. ``st1`` is indexed with the same
    positions as ``hr1``, so it must be at least as long as the part of
    ``hr1`` that is written (otherwise ``IndexError`` is raised and nothing
    is saved).

    Returns 1 after saving, or 0 (after printing a message) when
    ``excelFile`` does not exist.
    """
    max_sheet_rows = 1048576

    if not os.path.exists(excelFile):
        print("文件不存在")
        return 0

    wb = load_workbook(excelFile)
    ws = wb.active
    # Bound the loop up front so that no row beyond the sheet limit is ever
    # touched (checking after the write would let one extra row through).
    for index in range(min(len(hr1), max_sheet_rows)):
        r = index + 1
        ws.cell(row=r, column=2).value = hr1[index]
        ws.cell(row=r, column=3).value = st1[index]
    wb.save(excelFile)
    return 1



if __name__ == '__main__':  # script entry point
    # The first column of this workbook lists the sites to crawl; the
    # collected links and their texts are written back to columns 2 and 3
    # of the same file.
    excelFile = 'D:/test.xlsx'
    list1 = rexcel(excelFile)
    hr1 = []  # output column 2: link URL (or the site URL with a status)
    st1 = []  # output column 3: link text (or a status message)
    seen = set()  # mirrors hr1 so duplicate checks do not rescan the list

    for site in list1:
        if site is None:
            # The first empty cell ends the list of sites.
            break
        print(site)

        # Cells frequently carry trailing blanks, which break the request
        # and the trailing-slash handling below.
        site = site.strip()
        if not site.startswith('http'):
            continue

        demo = get_html(site)
        if demo == '':
            hr1.append(site)
            seen.add(site)
            st1.append(u'获取主页数据失败')
            continue

        soup = BeautifulSoup(demo, "html.parser")

        # Drop one trailing slash so that site + '/path' has a single slash.
        if site.endswith('/'):
            site = site[:-1]

        anchor_count = 0
        for a in soup.find_all('a'):
            anchor_count += 1
            if not a.has_attr('href'):
                continue
            href = a['href']
            # Skip absolute links, e-mail addresses, script pseudo-links
            # (including the misspelt 'javasrcipt:') and phone links.
            if (href.startswith('http') or '@' in href
                    or 'javascript:' in href or 'javasrcipt:' in href
                    or 'tel:' in href):
                continue
            if len(href) > 0 and href[0] != '/':
                href = '/' + href
            # NOTE(review): the link is appended to the site string as-is,
            # so relative links are only resolved correctly when the cell
            # holds a site root; '#...' and '//host/...' hrefs are not
            # treated specially.
            urlhb = site + href
            if urlhb in seen:  # filter out duplicate links
                continue
            seen.add(urlhb)
            hr1.append(urlhb)
            st1.append(a.string)

        if anchor_count == 0:
            # The page contains no <a> tags at all.
            hr1.append(site)
            seen.add(site)
            st1.append(u'该网页无链接')

    wexcel(excelFile, hr1, st1)

运行后的“D:\test.xlsx”文件如下图：
在这里插入图片描述
补充说明：如果爬取网页时报错提示状态码404，往往是Excel表中网址后面带有不可见字符造成的。

gig919

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python实现多个网站站内链接批量导出

本例多个网站的网址保存在”D:\test.xlsx”文件中第一列，代码运行结束后第二列导出站内链接，链接名保存在第三列，以下代码已经在Python27中调试通过：#coding=utf-8import requestsimport osimport httplib2from bs4 import BeautifulSoupfrom openpyxl import Workbookfr...
复制链接

扫一扫