python实现从一个网站中查找某个字段并保存结果的txt文件

最新推荐文章于 2023-08-31 06:51:37 发布

haimianjie2012

最新推荐文章于 2023-08-31 06:51:37 发布

阅读量961

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/haimianjie2012/article/details/101915614

版权

python 专栏收录该内容

32 篇文章 1 订阅

订阅专栏

需求说明：

有时候需要修改网站中的某些字段，但是一个网站的网页比较多，一个一个查找非常费时，因此用python语言写了一个小程序来查找网站中的字段，并将查找结果保存到txt文件中。

本文使用anaconda自带jupyter notebook实现。

上图是jupyter notebook打开后的样子，第三行是创建的文件保存位置，也是数据读取默认位置。

本文创建findWebStr.ipynb文件和保存的结果都在C:\Users\randongmei文件夹下：

findWebStr.ipynb源代码如下：

import requests
import os
from bs4 import BeautifulSoup
import webbrowser


def getHTMLText(url):
    '''
    Download a web page and return its HTML as text.

    url: address of the page to fetch.

    Returns the decoded response body on success.  When the request fails
    (connection error, timeout, HTTP error status, malformed URL) the
    placeholder string '产生异常' is returned instead of raising, so a crawl
    over many pages keeps going.
    '''
    try:
        # Give up if the server does not answer within 6 seconds.
        res = requests.get(url, timeout=6)
        # Turn HTTP error statuses (4xx/5xx) into an exception.
        res.raise_for_status()
        # Decode with the encoding guessed from the content instead of the
        # one announced by the server.
        res.encoding = res.apparent_encoding
        return res.text
    except (requests.RequestException, ValueError):
        # Only request/URL failures are swallowed here; a bare ``except``
        # would also hide Ctrl-C and genuine programming errors.
        return '产生异常'
    
    
def saveHtml(file_name, file_content):
    '''
    Write file_content to "<file_name>.html".

    file_name: base name of the file; every '/' is replaced by '_' because
        it is not allowed inside a file name.
    file_content: text to store; it is encoded to bytes before writing.
    '''
    target = "_".join(file_name.split('/')) + ".html"
    with open(target, "wb") as out:
        # The file is opened in binary mode, so the text is encoded first.
        payload = file_content.encode()
        out.write(payload)
        
        

def findSubStrInURLOne(url,substr_name):
    '''
    Search one web page for a piece of text and save the matches to a file.

    url: address of the page to search.
    substr_name: the text to look for.

    The page is downloaded with getHTMLText and parsed with BeautifulSoup.
    The text is looked up, in this order, in: <a href=...> elements,
    <meta content=...> values, <p> elements inside <li>, heading elements
    and finally every <p> element.  Each matching element is reported once.

    The matches are written, numbered, to
    "<url with separators replaced by '_'><substr_name>_findRes.txt" in the
    current directory, and are also returned.

    Returns a list whose items are the matching elements; for <meta> matches
    the item is the plain ``content`` string.
    '''
    findRes = []
    seen_ids = set()

    def add_match(element):
        # 'li p' and 'p' select overlapping elements; remember the identity
        # of each node so the same element is not reported twice.
        if id(element) not in seen_ids:
            seen_ids.add(id(element))
            findRes.append(element)

    demo = getHTMLText(url)
    soup = BeautifulSoup(demo, 'html.parser')

    # Links: every <a> element that carries an href attribute.
    for alin in soup.find_all('a', attrs={'href': True}):
        if substr_name in alin.text:
            add_match(alin)

    # Meta information: only the value of the content attribute is kept.
    for m in soup.find_all('meta', attrs={'content': True}):
        mm = m.get('content')
        if substr_name in mm:
            findRes.append(mm)

    # Paragraphs nested in list items.
    for lipone in soup.select('li p'):
        if substr_name in lipone.text:
            add_match(lipone)

    # Headings: probes the tag names h0..h9; names that do not occur in the
    # page simply select nothing.
    for i in range(10):
        for ha in soup.select('h' + str(i)):
            if substr_name in ha.text:
                add_match(ha)

    # All paragraphs; the ones already found through 'li p' are skipped.
    for di in soup.select('p'):
        if substr_name in di.text:
            add_match(di)

    urlname = url.replace('://','_').replace('/','_').replace('.','_')
    findResname = urlname + substr_name + '_findRes.txt'
    # URLs may carry '?', ':' and similar characters that cannot be used in
    # a file name on Windows.
    for forbidden in '\\/:*?"<>|':
        findResname = findResname.replace(forbidden, '_')

    enter = '\n\n'
    # ``with`` closes the file even if one of the writes fails.
    with open(findResname, 'wb') as findFile:
        head = '网址：'+url+'\n'+'结果：\n'
        findFile.write(head.encode())
        findFile.write(enter.encode())
        for total, item in enumerate(findRes, 1):
            tmmark = '第'+str(total)+'个：\n'
            findFile.write(tmmark.encode())
            # Both elements and plain strings provide encode().
            findFile.write(item.encode())
            findFile.write(enter.encode())

    return findRes
    
  

def GetOneWebAllUrl(web_url):
    '''
    Collect the ".html" pages linked from one page of a site.

    web_url: address of the page to scan; relative links are resolved
        against it.

    Every <a> element whose href contains ".html" is considered.  Links are
    resolved to absolute http(s) addresses and repeated links are dropped.
    The links are also logged to "href.txt" (UTF-8, current directory): the
    raw href, followed by the resolved address when the href was relative,
    and a closing line with the number of pages found.

    Returns the list of absolute page addresses in document order.
    '''
    # Local import so this function does not rely on the file's import block.
    from urllib.parse import urljoin

    res = []
    seen = set()
    demo = getHTMLText(web_url)
    soup = BeautifulSoup(demo, 'html.parser')

    # Every <a> element that carries an href attribute.
    a_labels = soup.find_all('a', attrs={'href': True})

    # ``with`` closes the log even if a write fails; UTF-8 keeps the Chinese
    # labels writable whatever the system's default encoding is.
    with open('href.txt', 'w', encoding='utf-8') as file:
        file.write('网站地址：' + web_url)
        file.write('\n')
        for a in a_labels:
            href = a.get('href').strip()
            if '.html' not in href:
                continue
            is_absolute = href.lower().startswith(('http://', 'https://'))
            # urljoin also resolves "/rooted", "../parent" and "//host/..."
            # hrefs, which plain string concatenation gets wrong.
            comurl = href if is_absolute else urljoin(web_url, href)
            if not comurl.lower().startswith(('http://', 'https://')):
                # e.g. "javascript:" or "mailto:" links cannot be downloaded.
                continue
            if comurl in seen:
                # The same page linked several times is visited only once.
                continue
            seen.add(comurl)
            file.write(href)
            file.write('\n')
            if not is_absolute:
                file.write(comurl)
                file.write('\n')
            res.append(comurl)
        file.write('该网站包含网页总数：' + str(len(res)))
    return res



def main():
    '''
    Entry point: search every page linked from the start page for each of
    the configured fields.
    '''
    # Start page of the crawl; replace it with any site you like.
    site = 'http://www.moptim.com/cn/'
    # Fields to look for, separated by '|'.
    keywords = "莫廷|我们|234|mainname".split('|')

    for page in GetOneWebAllUrl(site):
        for keyword in keywords:
            # The matches are written to a file by the call itself.
            findSubStrInURLOne(page, keyword)
        
      
        
# Run the crawl only when executed as a script (or notebook cell), not when
# this code is imported as a module.
if __name__ == "__main__":
    main()

结果保存在名称类似 http_www_moptim_cn_cn_pro_view-10_html我们_findRes.txt 的文件中：“html”及其之前的部分对应网页地址，“我们”是当前查找的字段。

该文件的意思就是：

网页 http_www_moptim_cn_cn_pro_view-10_html 中“我们”字段的查找结果。

支持多个字段查找，比如同时查找“我们”和“莫廷”，那么每个网页就会对应两个结果文件：“我们”字段一个txt文件，“莫廷”字段一个txt文件。

对应不同的需求，可以直接修改main函数对应的url和substr_name后面双引号内容。

url对应我们要查找定位的网站地址，

substr_name对应我们要查找的一个或多个字段。如果是多个字段，每个字段之间用“|”分开，例如要查找“我们”和“莫廷”两个字段，应该表示为：substr_name='我们|莫廷'

对先前的代码进行了改进，所有结果保存在一个txt文档里面，并且每个网址打印出对应标题

import requests
import os
from bs4 import BeautifulSoup
import webbrowser


def getHTMLText(url):
    '''
    Download a web page and return its HTML as text.

    url: address of the page to fetch.

    Returns the decoded response body on success.  When the request fails
    (connection error, timeout, HTTP error status, malformed URL) the
    placeholder string '产生异常' is returned instead of raising, so a crawl
    over many pages keeps going.
    '''
    try:
        # Give up if the server does not answer within 6 seconds.
        res = requests.get(url, timeout=6)
        # Turn HTTP error statuses (4xx/5xx) into an exception.
        res.raise_for_status()
        # Decode with the encoding guessed from the content instead of the
        # one announced by the server.
        res.encoding = res.apparent_encoding
        return res.text
    except (requests.RequestException, ValueError):
        # Only request/URL failures are swallowed here; a bare ``except``
        # would also hide Ctrl-C and genuine programming errors.
        return '产生异常'
    
    
def saveHtml(file_name, file_content):
    '''
    Store a page as "<file_name>.html".

    file_name: base name of the file; '/' cannot appear in a file name, so
        each one is replaced by '_'.
    file_content: text of the page; it is encoded to bytes for writing.
    '''
    safe_name = file_name.replace('/', '_')
    html_path = safe_name + ".html"
    with open(html_path, "wb") as handle:
        # Binary mode expects bytes rather than str.
        encoded = file_content.encode()
        handle.write(encoded)
        
        

def findSubStrInURLOne(url,substr_name):
    '''
    Search one web page for a piece of text.

    url: address of the page to search.
    substr_name: the text to look for.

    The page is downloaded with getHTMLText and parsed with BeautifulSoup.
    The text is looked up, in this order, in: <a href=...> elements,
    <meta content=...> values, <p> elements inside <li>, heading elements
    and finally every <p> element.  Each matching element is reported once.

    Returns a list whose first item is always the page title (the text of
    the first <title> element, or '' when the page has none), followed by
    the matches.  Matches are the matching elements; for <meta> matches the
    item is the plain ``content`` string.  A result of length 1 therefore
    means "nothing found".
    '''
    findRes = []
    seen_ids = set()

    def add_match(element):
        # 'li p' and 'p' select overlapping elements; remember the identity
        # of each node so the same element is not reported twice.
        if id(element) not in seen_ids:
            seen_ids.add(id(element))
            findRes.append(element)

    demo = getHTMLText(url)
    soup = BeautifulSoup(demo, 'html.parser')

    # Exactly one title entry goes first.  Pages without a <title>, or with
    # several (for example <title> elements inside inline SVG), would
    # otherwise shift the position at which the matches start.
    title_tag = soup.find('title')
    findRes.append(title_tag.text if title_tag is not None else '')

    # Links: every <a> element that carries an href attribute.
    for alin in soup.find_all('a', attrs={'href': True}):
        if substr_name in alin.text:
            add_match(alin)

    # Meta information: only the value of the content attribute is kept.
    for m in soup.find_all('meta', attrs={'content': True}):
        mm = m.get('content')
        if substr_name in mm:
            findRes.append(mm)

    # Paragraphs nested in list items.
    for lipone in soup.select('li p'):
        if substr_name in lipone.text:
            add_match(lipone)

    # Headings: probes the tag names h0..h9; names that do not occur in the
    # page simply select nothing.
    for i in range(10):
        for ha in soup.select('h' + str(i)):
            if substr_name in ha.text:
                add_match(ha)

    # All paragraphs; the ones already found through 'li p' are skipped.
    for di in soup.select('p'):
        if substr_name in di.text:
            add_match(di)

    return findRes
    
  

def GetOneWebAllUrl(web_url):
    '''
    Collect the ".html" pages linked from one page of a site.

    web_url: address of the page to scan; relative links are resolved
        against it.

    Every <a> element whose href contains ".html" is considered.  Links are
    resolved to absolute http(s) addresses and repeated links are dropped.
    The links are also logged to "href.txt" (UTF-8, current directory): the
    raw href, followed by the resolved address when the href was relative,
    and a closing line with the number of pages found.

    Returns the list of absolute page addresses in document order.
    '''
    # Local import so this function does not rely on the file's import block.
    from urllib.parse import urljoin

    res = []
    seen = set()
    demo = getHTMLText(web_url)
    soup = BeautifulSoup(demo, 'html.parser')

    # Every <a> element that carries an href attribute.
    a_labels = soup.find_all('a', attrs={'href': True})

    # ``with`` closes the log even if a write fails; UTF-8 keeps the Chinese
    # labels writable whatever the system's default encoding is.
    with open('href.txt', 'w', encoding='utf-8') as file:
        file.write('网站地址：' + web_url)
        file.write('\n')
        for a in a_labels:
            href = a.get('href').strip()
            if '.html' not in href:
                continue
            is_absolute = href.lower().startswith(('http://', 'https://'))
            # urljoin also resolves "/rooted", "../parent" and "//host/..."
            # hrefs, which plain string concatenation gets wrong.
            comurl = href if is_absolute else urljoin(web_url, href)
            if not comurl.lower().startswith(('http://', 'https://')):
                # e.g. "javascript:" or "mailto:" links cannot be downloaded.
                continue
            if comurl in seen:
                # The same page linked several times is visited only once.
                continue
            seen.add(comurl)
            file.write(href)
            file.write('\n')
            if not is_absolute:
                file.write(comurl)
                file.write('\n')
            res.append(comurl)
        file.write('该网站包含网页总数：' + str(len(res)))
    return res



def main(url='http://www.moptim.com/cn/', substr_name="莫廷|我们|234|mainname"):
    '''
    Search a site (or a single page) for several fields and write all the
    results to one text file.

    url: start address.  An address containing ".html" is searched as a
        single page; any other address is treated as an index page whose
        ".html" links are collected with GetOneWebAllUrl and searched.
    substr_name: the fields to look for, separated by '|'.  Empty fields
        are ignored because an empty string matches every element.

    The report is written to "<url with separators replaced by '_'>
    _findRes.txt" in the current directory.  For every page a separator line
    is written, then, for each field with at least one match, the address,
    the field, the page title and the numbered matches.
    '''
    strlist = [field for field in substr_name.split('|') if field]

    if '.html' in url:
        urlres = [url]
    else:
        urlres = GetOneWebAllUrl(url)

    urlname = url.replace('://','_').replace('/','_').replace('.','_')
    findResname = urlname+'_findRes.txt'
    fenge = '************************************************************************************************'
    enter = '\n\n'

    # ``with`` closes the report even if a page fails half-way through.
    with open(findResname,'wb') as findFile:
        for urlone in urlres:
            findFile.write(fenge.encode())
            findFile.write(enter.encode())
            for substr in strlist:
                findRes = findSubStrInURLOne(urlone,substr)
                # findSubStrInURLOne puts the page title first, so a list of
                # length 1 holds no actual match.
                if len(findRes) <= 1:
                    continue
                head = '网址：'+urlone+'\n'+'字段：'+substr+'\n'
                findFile.write(head.encode())
                for total, item in enumerate(findRes, 1):
                    if total == 1:
                        tmmark = '标题：'
                    else:
                        tmmark = '第'+str(total-1)+'个结果：\n'
                    findFile.write(tmmark.encode())
                    # Both elements and plain strings provide encode().
                    findFile.write(item.encode())
                    findFile.write(enter.encode())
        
# Run the crawl only when executed as a script (or notebook cell), not when
# this code is imported as a module.
if __name__ == "__main__":
    main()