"""
Created on May
@author: Administrator
"""
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from urllib.parse import quote
# Desktop-Chrome User-Agent, available for requests that need a
# browser-like identity.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
def parse_cookieTodict(co):
    """Parse a raw browser ``Cookie`` header string into a dict.

    Parameters
    ----------
    co : str
        Raw cookie string, e.g. ``"k1=v1; k2=v2"``.

    Returns
    -------
    dict
        Mapping of cookie name -> cookie value. Malformed fragments
        (no ``=``) are skipped instead of crashing.
    """
    codic = {}
    for pair in co.split(";"):
        pair = pair.strip()
        if not pair or "=" not in pair:
            # Original code raised IndexError on fragments without '='.
            continue
        # Split on the FIRST '=' only: base64-encoded cookie values
        # routinely end in '=' padding, which the original split() lost.
        name, value = pair.split("=", 1)
        codic[name.strip()] = value
    return codic
def find_keyword(content):
    """Extract result links and titles from a CSDN search-result page.

    Parameters
    ----------
    content : str
        HTML of one search-result page.

    Returns
    -------
    list[dict]
        One dict per result, with key ``'href'`` and — when the title
        regex matches — key ``'title'``.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Compile once; the original ran the same findall twice per result.
    title_re = re.compile(r"<a href.*>(.*)</a>")
    result = []
    for node in soup.find_all(attrs={'class': 'limit_width'}):
        anchors = node.find_all("a")
        if not anchors:
            # Guard: the original indexed [0] unconditionally and raised
            # IndexError on results without a link.
            continue
        entry = {'href': anchors[0].get('href')}
        # CSDN wraps the matched keyword in <em> tags; strip them so the
        # title regex captures the plain text.
        raw = str(node).replace("<em>", "").replace("</em>", "")
        match = title_re.findall(raw)
        if match:
            entry['title'] = match[0]
        result.append(entry)
    return result
def get_text(url, co):
    """Fetch *url* with the given cookies and return cleaned page HTML.

    Parameters
    ----------
    url : str
        Page to download.
    co : dict
        Cookie dict produced by ``parse_cookieTodict``.

    Returns
    -------
    str or bool
        Cleaned HTML on HTTP 200, otherwise ``False``.
    """
    session = requests.session()
    resp = session.get(url, cookies=co)
    if resp.status_code != 200:
        return False
    # Reuse the response already in hand — the original issued a second,
    # redundant GET for the same URL on every successful fetch.
    return resp.text.replace("\\", "").replace(">n", "").replace("rn", "")
def getdata(begin, num, keyword, cookie):
    """Crawl CSDN search pages and collect result links/titles.

    Parameters
    ----------
    begin : int
        First page number to fetch.
    num : int
        How many consecutive pages to fetch.
    keyword : str
        Search keyword; URL-encoded before use.
    cookie : str
        Raw browser cookie string (copied from a logged-in session).

    Returns
    -------
    pandas.DataFrame
        One row per search result found by ``find_keyword``.
    """
    cookie_dict = parse_cookieTodict(cookie)
    encoded = quote(keyword, encoding="utf-8")
    url_tpl = "https://so.csdn.net/so/search/s.do?p={}&q={}&t=&viparticle=&domain=&o=&s=&u=&l=&f="
    collected = []
    for offset in range(num):
        print(".....第{}页数据开始抓取....".format(offset + 1))
        page_html = get_text(url_tpl.format(str(begin + offset), encoded), cookie_dict)
        if page_html:
            collected.extend(find_keyword(page_html))
    print("抓取数据完成")
    return pd.DataFrame(collected)
'''
getdata(1,16,"数据分析",co)
第一个参数为从第几页开始,第二个参数是抓取多少页,第三个是搜索的关键词,第四个参数是浏览器cookie(登录csdn后,从请求头中复制过来即可)
'''
# Paste the Cookie header copied from a logged-in CSDN browser session here.
co='复制登录后的某dn的cookie'
# Scrape 16 pages of search results for the keyword, starting at page 1.
aa=getdata(1,16,"数据分析",co)
# 复制cookie (copy the cookie from the browser, see screenshot)
# ![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/83482a0c1f2e2687d6ee608fd5f66add.png)
# 获取连接的文本 (fetch the text of each linked article)
co='自己复制浏览器cookie'
def get_articledata(data):
    """Download every article listed in *data* and extract stats and body text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column holds article URLs (output of ``getdata``).

    Returns
    -------
    list[dict]
        Per article: ``num_read``, ``num_collection``, ``time``, ``text``.

    Notes
    -----
    Reads the module-level ``co`` cookie string. The original body did
    ``co = parse_cookieTodict(co)``, which raises ``UnboundLocalError``
    (the local assignment shadows the global before the RHS reads it);
    the parsed dict now lives under a distinct local name.
    """
    newdata = []
    cookie_dict = parse_cookieTodict(co)  # 'co' is the module-level cookie string
    tag_re = re.compile('<.*?>')  # strips residual HTML tags from the body text
    for idx in range(len(data)):
        html = get_text(data.iloc[idx, 0], cookie_dict)
        print("第{}篇文章".format(idx + 1))
        if not html:
            continue
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find(attrs={'id': 'content_views'})
        if not body:
            # No article body on this page: skip it. (The original also
            # skipped, after a dead 'text = []' assignment.)
            continue
        record = {}
        # NOTE(review): the [0] lookups below still raise IndexError if CSDN
        # changes its markup — unchanged from the original behavior.
        read_count = soup.find_all(attrs={'class': "read-count"})[0].string.replace("\n", "").replace(" ", "")
        post_time = re.findall("<span class=\"time\">(.*)</span>", str(soup.find_all(attrs={"class": "bar-content"})))[0]
        collect_count = soup.find_all(attrs={'class': "get-collection"})[0].string.replace("\n", "").replace(" ", "")
        # Concatenate the body's direct children, drop tags, then strip
        # non-breaking spaces and newlines.
        pieces = [str(child) for child in body.children if len(child) > 0]
        stext = tag_re.sub('', ''.join(pieces))
        stext = str(stext).replace("\xa0", "").replace("\n", "")
        record['num_read'] = read_count
        record['num_collection'] = collect_count
        record['time'] = post_time
        record['text'] = stext
        newdata.append(record)
    print("抓取完成")
    return newdata
# NOTE(review): 'b' is computed here but never used afterwards.
b=pd.DataFrame(aa.iloc[1:2])
# Download and parse every article found by the search crawl above.
dd=pd.DataFrame(get_articledata(aa))
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import jieba
# --- Word-cloud visualization of the scraped article texts ---
dd = pd.read_excel(r"D:\tade\csdn爬虫\数据分析.xlsx")
kk = dd.sort_values(by='num_read', ascending=False)
# ''.join over a generator instead of repeated '+' concatenation —
# the original loop was O(n^2) in total text length.
t = "".join(kk.iloc[a]['text'] for a in range(len(kk)))
cut_text = " ".join(jieba.cut(t))
wordcloud = WordCloud(
    font_path="C:/Windows/Fonts/simfang.ttf",
    background_color="white", width=1000, height=880).generate(cut_text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()