Crawler Project User Manual
Project 1: Scraping ChemicalBook
- Scraping the compound list
  Script: chemical.py
  Output file: data.xls (a spot-check sketch follows this list)
- Scraping detailed compound information
  Script: pagedata.py
  Output file: pagedata.txt
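After running chemical.py, the workbook can be spot-checked before moving on. A minimal sketch, assuming xlrd is installed (pip install xlrd); the file name and column layout come from the script in 1.1:

# Sketch: spot-check the workbook written by chemical.py.
# Assumes xlrd is installed; xlrd reads legacy .xls files.
import xlrd

book = xlrd.open_workbook('data.xls')
sheet = book.sheet_by_index(0)
for r in range(min(5, sheet.nrows)):
    print(sheet.row_values(r))  # first row should be the CAS/中文名/英文名/MF header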
1.1 Code for scraping the CAS number, Chinese name, English name, and molecular formula
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 21 09:49:56 2020
@author: JX
"""
import requests
from bs4 import BeautifulSoup
import re
import xlwt

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

# Index pages step by offsets of 100; this range covers the first two pages.
base_url = [
    'https://www.chemicalbook.com/CASDetailList_{}.htm'.format(i) for i in range(0, 101, 100)
]
print(base_url)

# Patterns for the four fields in each table row.
finds1 = re.compile(r'<a class="blue" href="/CAS.*?">(.*?)</a>', re.S)          # CAS number
finds2 = re.compile(r'<a class="blue" href="/ChemicalProductProperty_CN_.*?">(.*?)</a>', re.S)  # Chinese name
finds3 = re.compile(r'<td width="380">(.*?)</td>', re.S)                        # English name
finds4 = re.compile(r'<span id="ContentPlaceHolder1_ProductClassDetail_.*?">(.*?)</span>', re.S)  # molecular formula


def getData():
    datalist = []
    for url in base_url:  # loop over the index pages
        print('Fetching {}'.format(url))
        page = requests.get(url, headers=headers)
        # print(page.status_code)
        soup = BeautifulSoup(page.content, 'html.parser')
        # print(soup.prettify())
        for tr in soup.find_all('tr'):
            data = []
            tr = str(tr)
            tr = re.sub('\r\n', " ", tr)  # collapse line breaks inside the row
            s1 = re.findall(finds1, tr)
            if s1:
                data.append(s1[0])
            s2 = re.findall(finds2, tr)
            if s2:
                data.append(s2[0])
            s3 = re.findall(finds3, tr)
            if s3:
                data.append(s3[0])
            s4 = re.findall(finds4, tr)
            if s4:
                data.append(s4[0])
            # print(data)
            datalist.append(data)
    return datalist


def saveData(datalist, savepath):
    print("save......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('IPA', cell_overwrite_ok=True)
    col = ("CAS", "中文名", "英文名", "MF")
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(len(datalist)):
        # print("row %d" % (i + 1))
        data = datalist[i]
        # print(len(data))
        if len(data) == 4:  # skip rows with missing fields
            for j in range(0, 4):
                sheet.write(i + 1, j, data[j])
    book.save(savepath)


if __name__ == "__main__":
    datalist = getData()
    print(datalist)
    savepath = ".\\data.xls"
    del datalist[0]  # drop the table's header row
    saveData(datalist, savepath)
    print("Done scraping")
1.2 Code for scraping a single compound page
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
url = "https://www.chemicalbook.com/CAS_5446-18-4.htm"


# Append the scraped strings to the output file.
def save_contents(urlist):
    with open("./data.txt", 'a+', encoding='utf-8') as f:
        for i in urlist:
            f.write(i)
        # f.write(' ')


page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, 'html.parser')
trs = soup.find_all('div', id="ContentPlaceHolder1_SubClass")
for tr in trs:
    for td in tr.stripped_strings:
        # print(td)
        save_contents(td)

# Re-read the file and turn the 【key】value labels into "key: value" text.
with open('data.txt', 'r', encoding='utf-8') as f:
    dic = []
    for line in f.readlines():
        # line = str(line).replace("\n", "")
        b = re.split('【', line)
        dic.append(b)
dic = str(dic)
# save_contents(str(dic))
dic = re.sub('】', ":", dic)
print(dic)
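The post-processing above flattens every line into one string before turning 】 into a colon. If structured output is preferred, the 【key】value labels can be parsed into a dict instead. A minimal sketch under the same assumptions (data.txt holds the text written by save_contents; parse_fields is an illustrative helper, not part of the original script):

# Sketch: parse "【key】value" labels from data.txt into a dict.
# parse_fields is a hypothetical helper, not part of the original script.
def parse_fields(text):
    fields = {}
    for chunk in text.split('【'):
        if '】' in chunk:
            key, _, value = chunk.partition('】')
            fields[key.strip()] = value.strip()
    return fields

with open('data.txt', 'r', encoding='utf-8') as f:
    print(parse_fields(f.read()))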
Project 2: Scraping the IPA database (code for retrieving single-page data)
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 23 16:31:25 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import unicodedata

# Patterns for the header cell and the alternating content cells.
finds0 = re.compile(r'<td class="tableheadbkgr">(.*?)</td>', re.S)
finds1 = re.compile(r'<td class="b1" width="715">(.*?)</td>', re.S)
finds2 = re.compile(r'<td class="a1" width="715">(.*?)</td>', re.S)
finds3 = re.compile(r'<td align="left" class="b1" width="715">(.*?)</td>', re.S)
finds4 = re.compile(r'<td align="left" class="a1" width="715">(.*?)</td>', re.S)


# Strip markup and boilerplate from a table cell.
def remove(tr):
    tr = re.sub(r'<br(\s+)?/>(\s+)?', " ", tr)
    tr = re.sub(r'<sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'</sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'<a.*?>', " ", tr)
    tr = re.sub(r'</a(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'<span(\s+)?>(\s+)?', " ", tr)
    tr = re.sub('<span class="tableinstructional0">', "", tr)
    tr = re.sub('<span id="intNetworkLink"', "1", tr)
    tr = re.sub(r'</span(\s+)?>(\s+)?', " ", tr)
    tr = re.sub('--', "", tr)
    tr = re.sub('Interaction', "", tr)
    tr = re.sub('Network', "", tr)
    tr = re.sub('>', "", tr)
    tr = re.sub('1>', "", tr)
    tr = re.sub('IPA Chem View:', "", tr)
    tr = unicodedata.normalize('NFKC', tr)  # normalize full-width characters
    tr = tr.replace('\n', "")
    return tr


# Parse a locally saved IPA page (url is a local file path).
def getData(url):
    data = []
    kong = []
    fp = open(url, 'r', encoding='utf-8')
    soup = BeautifulSoup(fp, 'html.parser')
    res0 = str(soup.find('td', class_="tableheadbkgr"))
    res0 = remove(res0)
    s0 = re.findall(finds0, res0)
    # print(s0)
    data.append(s0)
    res = soup.find_all('table', class_="tablenodeviewcontainer")
    for tr in res:
        tr = str(tr)
        tr = remove(tr)
        s1 = re.findall(finds1, tr)
        s1 = [[i, ] for i in s1]
        if len(s1) == 8:
            for i in range(len(s1)):
                data.append(s1[i])