python learning 基础爬虫，获取网页上所有的链接地址

最新推荐文章于 2023-12-20 16:13:32 发布

一只代码汪~-~

最新推荐文章于 2023-12-20 16:13:32 发布

阅读量485

点赞数

文章标签： python3

本文链接：https://blog.csdn.net/cx2411/article/details/85253578

版权

自学 Python 3 两三周的时候写的，欢迎改进指教。

# 导入文件库
import datetime
import ssl
import urllib
import urllib.request
from tool import johntool
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import sys
import time
class webspider:
    """Download a web page and list the hyperlinks found on it.

    Both public methods are static and are called on the class itself, e.g.
    ``webspider.reptilianSpider('https://example.com/')``.
    """

    # Browser-like headers sent when the caller does not supply any.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.79 Safari/537.36'
    }

    # Translation table that deletes carriage returns, newlines and spaces.
    _STRIP_TABLE = str.maketrans('', '', '\r\n ')

    @staticmethod
    def getPageContent(url='', html='html.parser', headers=None, delay=3):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address of the page to download.
            html: Unused here; kept so existing callers that pass it still work.
            headers: Mapping of request headers. ``None`` selects
                ``webspider.DEFAULT_HEADERS``.
            delay: Seconds to sleep after a successful download, before
                returning. Pass ``0`` to skip the pause.

        Returns:
            The response body as ``bytes``, or ``False`` when the download
            fails (the error is handed to ``johntool`` for logging first).
        """
        if headers is None:
            headers = webspider.DEFAULT_HEADERS
        try:
            # SECURITY: certificate verification is switched off on purpose so
            # that sites with broken certificates can still be crawled. The
            # relaxed context is used for this request only; the process-wide
            # HTTPS default is left untouched.
            context = ssl.create_default_context()
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE

            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, context=context) as response:
                html_doc = response.read()
        except Exception:
            # Best-effort boundary: any download error is logged and reported
            # as False. KeyboardInterrupt/SystemExit are not caught, so the
            # crawler can still be stopped.
            errorinfo = str(sys.exc_info())
            # NOTE(review): johntool is a project-local helper; presumably it
            # appends the message to the given log file -- confirm.
            johntool.johntool.writeFile(
                errorinfo,
                path=sys.path[0] + '\\log\\',
                file='log_' + time.strftime('%Y-%m-%d') + '.log',
                isAddInTime=True,
            )
            return False

        if delay:
            time.sleep(delay)
        return html_doc

    @staticmethod
    def _resolveLink(url, href_link):
        """Return ``href_link`` as an absolute URL, resolved against ``url``."""
        # Local import so this class does not depend on a new file-level import.
        from urllib.parse import urljoin

        try:
            return urljoin(url, href_link)
        except ValueError:
            # Malformed link: keep it as found rather than abort the crawl.
            return href_link

    @staticmethod
    def reptilianSpider(url, html='html.parser'):
        """Collect every usable ``<a href>`` link on the page at ``url``.

        Anchors are skipped when they have no string ``href``, when the
        ``href`` is empty or just ``/``, or when it contains ``javascript:``
        (in any letter case). Remaining links are resolved to absolute URLs
        relative to ``url``.

        Args:
            url: Address of the page to crawl.
            html: Parser name handed to ``BeautifulSoup``.

        Returns:
            A list with one dict per link, with these keys: ``hrefText``
            (anchor text with CR, LF and spaces removed), ``hrefLink``
            (absolute URL), ``lastTime`` (0), ``storageTime`` (timestamp of
            this crawl, ``YYYY-mm-dd HH:MM:SS``), ``isDone`` (False),
            ``runing`` (False) and ``grabAddress`` (``url`` as passed in).
            The list is empty when the page cannot be downloaded.
        """
        page = webspider.getPageContent(url=url, html=html)
        if page is False:
            # Download failed and was already logged; nothing to parse.
            return []

        soup = BeautifulSoup(page, html)
        nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        data = []
        for link in soup.find_all('a'):
            href_link = link.get('href')
            if not isinstance(href_link, str):
                continue

            # Drop CR/LF/spaces that sometimes leak into attribute values.
            href_link = href_link.translate(webspider._STRIP_TABLE)
            if href_link in ('', '/') or 'javascript:' in href_link.lower():
                continue

            link_text = link.get_text().strip().translate(webspider._STRIP_TABLE)
            data.append({
                'hrefText': link_text,
                'hrefLink': webspider._resolveLink(url, href_link),
                'lastTime': 0,
                'storageTime': nowTime,
                'isDone': False,
                'runing': False,
                'grabAddress': url,
            })
        return data

一只代码汪~-~

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python learning 基础爬虫，获取网页上所有的链接地址

自学 Python 3 两三周的时候写的，欢迎改进指教。 # 导入文件库 import datetime; import ssl; import urllib; import urllib.request; from tool import johntool; from bs4 import BeautifulSoup; from urllib.parse import urlparse; import sys ...
复制链接

扫一扫