先上图,如果不是需要的就可以不用往下看了。
首先是什么样的层级结构,如下图,有些长,不完整,完整的图太长上传不了,打开链接https://www.genome.jp/kegg/pathway.html查看。
转化后的表格
完整代码
import reimport lxmlimport requestsfrom bs4 import BeautifulSoupfrom fake_useragent import UserAgentclass kegg(object): def __init__(self): self.url = 'https://www.genome.jp/kegg/pathway.html' ua = UserAgent(verify_ssl=False) self.headers = {'User-Agent': ua.random} def get_html(self, url): response = requests.get(url, headers=self.headers) if response.status_code == 200: return response else: return None def keggPathway(self, url): if self.get_html(url): html = self.get_html(url).text soup = BeautifulSoup(html, "lxml") Kmain = soup.find(id="main") fp = [] sp = [] for i in Kmain: if re.findall(r"
(.*?)
", str(i)): fp.append(re.findall(r"
(.*?)
", str(i))) if re.findall(r"(.*?)", str(i)): sp.append(re.findall(r"(.*?)", str(i))) del sp[0] kt = Kmain.find_all(class_="list") print(len(fp), len(sp), len(kt)) tpw = [] for i in fp: for j, k in zip(sp, kt): if i[0][0] == j[0][0]: fs = re.sub(r'^[^A-Z]*', '', i[0]) + "\t" + re.sub( r'^[^A-Z]*', '', j[0]) ka = k.find_all('a') for l in ka: kau = l.get("href") kat = l.get_text() mapN = re.findall(r'\?(.*?)$', str(kau))[0] tpw.append(fs + "\t" + kat + "\t" + mapN) return tpw def main(self): tpw = self.keggPathway(self.url) with open("temptpw.tsv", "w") as f: for i in tpw: f.write(i + "\n")if __name__ == "__main__": kp = kegg() kp.main()
变量名起的很随意,上面代码理解起来不难,暂不解释。