Crawler Project User Manual
Project 1: Scraping ChemicalBook
- Scraping the compound list
  Script: chemical.py
  Output file: data.xls (a spot-check sketch follows this list)
- Scraping detailed compound information
  Script: pagedata.py
  Output file: pagedata.txt
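After running chemical.py, the workbook can be spot-checked before moving on. A minimal sketch, assuming xlrd is installed (pip install xlrd); the file name and column layout come from the script in 1.1:

# Sketch: spot-check the workbook written by chemical.py.
# Assumes xlrd is installed; xlrd reads legacy .xls files.
import xlrd

book = xlrd.open_workbook('data.xls')
sheet = book.sheet_by_index(0)
for r in range(min(5, sheet.nrows)):
    print(sheet.row_values(r))  # first row should be the CAS/中文名/英文名/MF header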
1.1 Code for scraping the CAS number, Chinese name, English name, and molecular formula
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 21 09:49:56 2020
@author: JX
"""
import requests
from bs4 import BeautifulSoup
import re
import xlwt

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

# Index pages step by offsets of 100; this range covers the first two pages.
base_url = [
    'https://www.chemicalbook.com/CASDetailList_{}.htm'.format(i) for i in range(0, 101, 100)
]
print(base_url)

# Patterns for the four fields in each table row.
finds1 = re.compile(r'<a class="blue" href="/CAS.*?">(.*?)</a>', re.S)          # CAS number
finds2 = re.compile(r'<a class="blue" href="/ChemicalProductProperty_CN_.*?">(.*?)</a>', re.S)  # Chinese name
finds3 = re.compile(r'<td width="380">(.*?)</td>', re.S)                        # English name
finds4 = re.compile(r'<span id="ContentPlaceHolder1_ProductClassDetail_.*?">(.*?)</span>', re.S)  # molecular formula


def getData():
    datalist = []
    for url in base_url:  # loop over the index pages
        print('Fetching {}'.format(url))
        page = requests.get(url, headers=headers)
        # print(page.status_code)
        soup = BeautifulSoup(page.content, 'html.parser')
        # print(soup.prettify())
        for tr in soup.find_all('tr'):
            data = []
            tr = str(tr)
            tr = re.sub('\r\n', " ", tr)  # collapse line breaks inside the row
            s1 = re.findall(finds1, tr)
            if s1:
                data.append(s1[0])
            s2 = re.findall(finds2, tr)
            if s2:
                data.append(s2[0])
            s3 = re.findall(finds3, tr)
            if s3:
                data.append(s3[0])
            s4 = re.findall(finds4, tr)
            if s4:
                data.append(s4[0])
            # print(data)
            datalist.append(data)
    return datalist


def saveData(datalist, savepath):
    print("save......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('IPA', cell_overwrite_ok=True)
    col = ("CAS", "中文名", "英文名", "MF")
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(len(datalist)):
        # print("row %d" % (i + 1))
        data = datalist[i]
        # print(len(data))
        if len(data) == 4:  # skip rows with missing fields
            for j in range(0, 4):
                sheet.write(i + 1, j, data[j])
    book.save(savepath)


if __name__ == "__main__":
    datalist = getData()
    print(datalist)
    savepath = ".\\data.xls"
    del datalist[0]  # drop the table's header row
    saveData(datalist, savepath)
    print("Done scraping")
1.2 Code for scraping a single compound page
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
url = "https://www.chemicalbook.com/CAS_5446-18-4.htm"


# Append the scraped strings to the output file.
def save_contents(urlist):
    with open("./data.txt", 'a+', encoding='utf-8') as f:
        for i in urlist:
            f.write(i)
        # f.write(' ')


page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, 'html.parser')
trs = soup.find_all('div', id="ContentPlaceHolder1_SubClass")
for tr in trs:
    for td in tr.stripped_strings:
        # print(td)
        save_contents(td)

# Re-read the file and turn the 【key】value labels into "key: value" text.
with open('data.txt', 'r', encoding='utf-8') as f:
    dic = []
    for line in f.readlines():
        # line = str(line).replace("\n", "")
        b = re.split('【', line)
        dic.append(b)
dic = str(dic)
# save_contents(str(dic))
dic = re.sub('】', ":", dic)
print(dic)
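The post-processing above flattens every line into one string before turning 】 into a colon. If structured output is preferred, the 【key】value labels can be parsed into a dict instead. A minimal sketch under the same assumptions (data.txt holds the text written by save_contents; parse_fields is an illustrative helper, not part of the original script):

# Sketch: parse "【key】value" labels from data.txt into a dict.
# parse_fields is a hypothetical helper, not part of the original script.
def parse_fields(text):
    fields = {}
    for chunk in text.split('【'):
        if '】' in chunk:
            key, _, value = chunk.partition('】')
            fields[key.strip()] = value.strip()
    return fields

with open('data.txt', 'r', encoding='utf-8') as f:
    print(parse_fields(f.read()))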
Project 2: Scraping the IPA database (code for retrieving single-page data)
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 23 16:31:25 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import unicodedata

# Patterns for the header cell and the alternating content cells.
finds0 = re.compile(r'<td class="tableheadbkgr">(.*?)</td>', re.S)
finds1 = re.compile(r'<td class="b1" width="715">(.*?)</td>', re.S)
finds2 = re.compile(r'<td class="a1" width="715">(.*?)</td>', re.S)
finds3 = re.compile(r'<td align="left" class="b1" width="715">(.*?)</td>', re.S)
finds4 = re.compile(r'<td align="left" class="a1" width="715">(.*?)</td>', re.S)


# Strip markup and boilerplate from a table cell.
def remove(tr):
    tr = re.sub(r'<br(\s+)?/>(\s+)?', " ", tr)
    tr = re.sub(r'<sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'</sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'<a.*?>', " ", tr)
    tr = re.sub(r'</a(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'<span(\s+)?>(\s+)?', " ", tr)
    tr = re.sub('<span class="tableinstructional0">', "", tr)
    tr = re.sub('<span id="intNetworkLink"', "1", tr)
    tr = re.sub(r'</span(\s+)?>(\s+)?', " ", tr)
    tr = re.sub('--', "", tr)
    tr = re.sub('Interaction', "", tr)
    tr = re.sub('Network', "", tr)
    tr = re.sub('>', "", tr)
    tr = re.sub('1>', "", tr)
    tr = re.sub('IPA Chem View:', "", tr)
    tr = unicodedata.normalize('NFKC', tr)  # normalize full-width characters
    tr = tr.replace('\n', "")
    return tr


# Parse a locally saved IPA page (url is a local file path).
def getData(url):
    data = []
    kong = []
    fp = open(url, 'r', encoding='utf-8')
    soup = BeautifulSoup(fp, 'html.parser')
    res0 = str(soup.find('td', class_="tableheadbkgr"))
    res0 = remove(res0)
    s0 = re.findall(finds0, res0)
    # print(s0)
    data.append(s0)
    res = soup.find_all('table', class_="tablenodeviewcontainer")
    for tr in res:
        tr = str(tr)
        tr = remove(tr)
        s1 = re.findall(finds1, tr)
        s1 = [[i, ] for i in s1]
        if len(s1) == 8:
            for i in range(len(s1)):
                data.append(s1[i])