Python web scraper
import requests
import os
import time
import threading
import pandas as pd
from bs4 import BeautifulSoup
import random
import re
# Scrape the GEO accession page for GSE154826 and save its FTP
# supplementary-file links to a CSV file.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
url1 = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE154826'
r = requests.get(url1, headers=headers, timeout=60)  # fetch the whole HTML page
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
soup1 = BeautifulSoup(r.text, 'html.parser')
# Every anchor whose href is an FTP link; the original skipped the first
# three matches (presumably page chrome rather than data files — keep that).
links = soup1.find_all(href=re.compile('ftp:'))[3:]
# Read the URL from the href attribute directly. The original sliced the
# tag's string form (str(i)[9:] then [:-12]), which silently breaks — and
# can clip URL characters — whenever attribute order or anchor text varies.
ftp_urls = [tag['href'] for tag in links]
pd.DataFrame(ftp_urls).to_csv('GSE154826.csv')
Reposted from LGH (FJMU)
import requests
import os
import time
import threading
import pandas as pd
from bs4 import BeautifulSoup
import random
# Browser-like User-Agent so NCBI serves the normal desktop page
# to these requests instead of treating them as a bot.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
}
def download_pdf(path,pmid,scihub_adress):
#pmid = 23480440
url1 = 'https://pubmed.ncbi.nlm.nih.gov/{}/'.format(pmid)
r = requests.get(url1, headers=headers, timeout=60) # get the whole htlm infomation
soup1 = BeautifulSoup(r.text,'html.parser') #
try:
DOI = soup1.find('span',class_='citation-doi').text.split(':')[1].strip() #[:-1]
# ===============================================&#