本文封装的实例下载:https://download.csdn.net/download/qq_19741181/10279675
使用 Python 抓取知网(CNKI)论文摘要
-----------------------------------------------------------
文章主要参考:http://blog.csdn.net/Eastmount/article/details/78534119?locationNum=6&fps=1
-----------------------------------------------------------
(C:\ProgramData\Anaconda3) C:\Users\d>pip install --upgrade beautifulsoup4
Requirement already up-to-date: beautifulsoup4 in c:\programdata\anaconda3\lib\site-packages
(C:\ProgramData\Anaconda3) C:\Users\d>
----按照网上的方法尝试升级了一下,参考:http://blog.csdn.net/sinat_26599509/article/details/50609646 ----
>>> import time
>>> import re
>>> import urllib
>>> import bs4
>>> from bs4 import BeautifulSoup
>>> if __name__ == '__main__':
... url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
... content = urllib.urlopen(url).read()
... soup = BeautifulSoup(content,"html.parser")
... wz_tab = soup.find_all("div",class_="wz_tab")
... num = 0
... for tab in wz_tab:
... title = tab.find("h3")
... print(title.get_text())
... urls = tab.find("h3").find_all("a")
... flag = 0
... for u in urls:
... if flag==0:
... print(u.get('href'))
... flag += 1
... abstract = tab.find(attrs={"class":"width715"}).get_text()
... print(abstract)
... other = tab.find(attrs={"class":"year-count"})
... content = other.get_text().split("\n")
... cb_from = other.find_all("span")
... flag = 0
... for u in cb_from:
... if flag==0:
... print(u.get("title"))
... flag += 1
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[0])
... print(number[0])
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[1])
... if len(number)=&