python 热词分析_Python 爬取热词并进行分类数据分析-[简单准备] （2020年寒假小目标05）...-CSDN博客

日期：2020.01.27

博客期：135

星期一

【本博客的代码如若要使用，请在下方评论区留言，之后再用(就是跟我说一声)】

今天问了一下老师，信息领域热词从哪里爬，老师说是IT方面的新闻，嗯~有点儿意思了！

我找到了好多IT网站，但是大多数广告又多，名词也不专门针对信息领域，所以我就暂且用了一个相对还好的例子：

数据来源网址：https://news.51cto.com/(最终不一定使用此网站的爬取数据)

网站的相关热词来源截图：

如图，“智能”、“技术”、“区块链”为爬取目标

进行爬取(因为每一次执行JS都会加重爬取任务的负担)：当你已经执行了100次之后，再执行第101次JS，它所消耗的时间大概是27s！所以，这种方法我就只爬100次，得到5607条数据：

爬取代码：

import codecs
import time
from urllib import request

import parsel
from selenium import webdriver

# [ Collection of special-purpose string helpers ]
class StrSpecialDealer:
    """String helpers used when post-processing extracted HTML fragments."""

    @staticmethod
    def getReaction(stri):
        """Return the text between the first ``>`` and the last ``<`` of *stri*.

        All space characters are removed first, so the result never contains
        spaces. For a single element such as ``<a class="tag">x</a>`` this
        yields the element's inner text.

        Args:
            stri: The fragment to process; it is converted with ``str()``.

        Returns:
            The space-free substring enclosed by the outermost tag pair.
        """
        strs = str(stri).replace(" ", "")
        # NOTE(review): the published listing is cut off right after
        # ``strs.rfind(`` (the "<" was swallowed by HTML stripping). The slice
        # end and the ``return`` below are reconstructed -- confirm against
        # the author's original file.
        strs = strs[strs.find(">") + 1:strs.rfind("<")]
        return strs

class StringWriter:
    """Line-oriented UTF-8 writer bound to a single file path.

    Every call opens and closes the file, so no handle is kept between calls.
    """

    # Class-level default; each instance overwrites it in __init__.
    filePath = ""

    def __init__(self, str):
        """Remember the path of the file to write.

        Args:
            str: Path of the target file. The parameter name shadows the
                builtin but is kept so existing callers keep working.
        """
        self.filePath = str

    def makeFileNull(self):
        """Create the file if it is missing and truncate it to zero length."""
        # ``with`` closes the handle even if opening succeeds and a later
        # step fails; nothing needs to be written to truncate the file.
        with open(self.filePath, "w", encoding="utf-8", newline=""):
            pass

    def write(self, stri):
        """Append *stri* followed by a single ``\\n`` to the file.

        Args:
            stri: Text of the line, without a trailing newline.
        """
        # newline="" disables newline translation so the bytes written match
        # what the former codecs.open-based implementation produced.
        with open(self.filePath, "a", encoding="utf-8", newline="") as f:
            f.write(stri + "\n")

# [ Object that keeps one browser session open for repeated page scraping ]
class WebConnector:
    """Drive one Firefox session on news.51cto.com and dump tag texts to a file.

    Attributes are declared at class level with placeholder values and are
    replaced with real objects in ``__init__``.
    """

    profile = ""  # the selenium WebDriver instance once constructed
    sw = ""       # the StringWriter that receives the extracted tag texts

    # ---[ constructor ]
    def __init__(self):
        """Start Firefox, load the news page and empty the output file."""
        self.profile = webdriver.Firefox()
        self.profile.get('https://news.51cto.com/')
        self.sw = StringWriter("../testFile/info.txt")
        self.sw.makeFileNull()

    # ---[ release ]
    def __close__(self):
        """Quit the browser session.

        This is an ordinary method with a dunder-style name; Python never
        calls it automatically, so callers must invoke it themselves.
        """
        self.profile.quit()

    def getHTMLText(self):
        """Return the HTML source of the page currently shown in the browser."""
        return self.profile.page_source

    def getFirstChanel(self):
        """Write the text of every ``.tag`` element on the page to the output file.

        Prints the number of matched elements, then appends one line per
        element via the StringWriter.
        """
        index_sel = parsel.Selector(self.getHTMLText())
        links = index_sel.css('.tag').extract()
        print("Len=" + str(len(links)))
        for link in links:
            self.sw.write(StrSpecialDealer.getReaction(link))

    def getMore(self):
        """Click the ``.listsmore`` element, then wait one second.

        Presumably this is the page's "load more" control -- verify against
        the live site.
        """
        # find_element_by_css_selector was removed in Selenium 4.3; the
        # generic find_element with the "css selector" strategy works on
        # both Selenium 3 and 4 without an extra import.
        self.profile.find_element("css selector", ".listsmore").click()
        time.sleep(1)

def main():
    """Click the "more" control 100 times, then dump all tags on the page.

    The browser is always closed, even when a click or the extraction fails.
    """
    wc = WebConnector()
    try:
        for i in range(0, 100):
            print(i)
            wc.getMore()
        # NOTE(review): indentation was lost in the published listing. The
        # extraction is placed after the loop because the accompanying text
        # reports 5607 rows for 100 loads, which matches a single dump of the
        # fully expanded page -- confirm against the author's original file.
        wc.getFirstChanel()
    finally:
        wc.__close__()


if __name__ == "__main__":
    main()

Director.py

之后再使用MapReduce进行次数统计，就可以了(还可以配合维基百科和百度百科获取(爬取)相关热词的其他信息)

然后是词频统计(因为测试用，数据量不大，就写了简单的Python词频统计程序)：

1 importcodecs2

class StringWriter:
    """Line-oriented UTF-8 writer bound to a single file path.

    Every call opens and closes the file, so no handle is kept between calls.
    """

    # Class-level default; each instance overwrites it in __init__.
    filePath = ""

    def __init__(self, str):
        """Remember the path of the file to write.

        Args:
            str: Path of the target file. The parameter name shadows the
                builtin but is kept so existing callers keep working.
        """
        self.filePath = str

    def makeFileNull(self):
        """Create the file if it is missing and truncate it to zero length."""
        # ``with`` closes the handle even if opening succeeds and a later
        # step fails; nothing needs to be written to truncate the file.
        with open(self.filePath, "w", encoding="utf-8", newline=""):
            pass

    def write(self, stri):
        """Append *stri* followed by a single ``\\n`` to the file.

        Args:
            stri: Text of the line, without a trailing newline.
        """
        # newline="" disables newline translation so the bytes written match
        # what the former codecs.open-based implementation produced.
        with open(self.filePath, "a", encoding="utf-8", newline="") as f:
            f.write(stri + "\n")

class Multi:
    """Reader that returns all lines of a UTF-8 text file."""

    # Class-level default; each instance overwrites it in __init__.
    filePath = ""

    def __init__(self, filepath):
        """Remember the path of the file to read.

        Args:
            filepath: Path of the input file.
        """
        self.filePath = filepath

    def read(self):
        """Return the file's lines as a list, line endings included.

        Returns:
            The result of ``readlines()`` on the file opened as UTF-8 text.
        """
        # The original never closed the handle; ``with`` releases it as soon
        # as the lines have been read.
        with open(self.filePath, mode='r', encoding='utf-8') as fw:
            return fw.readlines()

class Bean:
    """A word together with the number of times it has been counted."""

    # Class-level defaults; each instance overwrites them in __init__.
    name = ""
    num = 0

    def __init__(self, name, num):
        """Store the word and its starting count.

        Args:
            name: The word this record stands for.
            num: The initial count.
        """
        self.name = name
        self.num = num

    # The dunder-style names below are plain methods kept for compatibility;
    # Python does not call them implicitly.

    def __addOne__(self):
        """Increase the count by one."""
        self.num = self.num + 1

    def __toString__(self):
        """Return ``"<name>\\t<num>"``, the line format used for output."""
        return self.name + "\t" + str(self.num)

    def __isName__(self, str):
        """Return True when *str* equals this record's word, else False.

        Args:
            str: The word to compare against. The parameter name shadows the
                builtin but is kept so existing callers keep working.
        """
        return str == self.name

class BeanGroup:
    """A list of Bean records that counts how often each word is added."""

    # Class-level placeholder; __init__ gives every instance its own list so
    # instances never share state through this attribute.
    data = []

    def __init__(self):
        """Start with an empty list of records."""
        self.data = []

    # The dunder-style names below (other than __len__) are plain methods
    # kept for compatibility; Python does not call them implicitly.

    def __exist__(self, str):
        """Return True when a record for the word *str* is present.

        Args:
            str: The word to look for. The parameter name shadows the
                builtin but is kept so existing callers keep working.
        """
        return any(bean.__isName__(str) for bean in self.data)

    def __addItem__(self, str):
        """Count one occurrence of the word *str*.

        Every existing record for the word is incremented; when there is
        none, a new record with count 1 is appended.

        Args:
            str: The word to count.
        """
        # One pass instead of the original's two (an existence scan followed
        # by an increment scan); the outcome is the same.
        found = False
        for bean in self.data:
            if bean.__isName__(str):
                bean.__addOne__()
                found = True
        if not found:
            self.data.append(Bean(str, 1))

    def __len__(self):
        """Return the number of distinct records."""
        return len(self.data)

def takenum(ele):
    """Sort key: return the ``num`` attribute of *ele*.

    Args:
        ele: Any object exposing a ``num`` attribute.

    Returns:
        The value of ``ele.num``.
    """
    return ele.num

def main():
    """Count word frequencies in info.txt and write them to output.txt.

    Reads ``../testFile/info.txt`` (one word per line), counts each word and
    writes ``word<TAB>count`` lines to ``../testFile/output.txt`` in
    descending order of count.
    """
    # The output file is emptied before the input is read, as in the
    # original statement order.
    sw = StringWriter("../testFile/output.txt")
    sw.makeFileNull()

    bg = BeanGroup()
    for line in Multi("../testFile/info.txt").read():
        # Drop line terminators so "word\n" and "word\r\n" count as "word".
        bg.__addItem__(str(line).replace("\n", "").replace("\r", ""))

    # list.sort is stable, so words with equal counts keep first-seen order.
    bg.data.sort(key=takenum, reverse=True)
    for bean in bg.data:
        sw.write(str(bean.__toString__()))


if __name__ == "__main__":
    main()

Multi.py

统计结果如下：