python热词关系图_Python 爬取热词并进行分类数据分析-[拓扑数据]

最新推荐文章于 2024-04-24 14:25:04 发布

weixin_39890633

最新推荐文章于 2024-04-24 14:25:04 发布

阅读量377

点赞数

文章标签： python热词关系图

日期：2020.01.29

博客期：137

星期三

【本博客的代码如若要使用，请在下方评论区留言，之后再用(就是跟我说一声)】

所有相关跳转：

a.【简单准备】

c.【拓扑数据】(本期博客)

d.【数据修复】

i . 【App制作】

j . 【安全性改造】

嗯，先声明一下 “拓扑数据”的意思，应老师需求，我们需要将热词的解释、引用等数据从百科网站中爬取下来，之后将统一的热词数据进行文件处理，组合成新的数据表，然后可以在网页上(暂时是网页)展示更多的信息。

嗯，可以对热词解释进行爬取了，给大家看一下　　(以人工智能为例)

我发现了一个问题：

我发现 setAttr("value","人工智能") 方法并不能把 input 的 value 属性值改成想要的“人工智能”，所以改用 sendKeys("人工智能") 方法来实现（Python 版 Selenium 中写作 send_keys()）。不过这样又有了一个问题：每一次 sendKeys() 都相当于在 input 内部已有内容的后面再追加字符，比如原本 input 里有“茄子”字样，之后使用 sendKeys("蔬菜")，input 里就变成了“茄子蔬菜”！这个问题导致我不能在同一个页面上直接连续搜索、跳转。如何解决呢？

我从它的方法里找到了clear()方法，亲测可用(在sendKeys之前使用)。

我在这里提供测试类代码：

1 importparsel2 from urllib importrequest3 importcodecs4 from selenium importwebdriver5 importtime6

# [ Collection of special string-handling helpers ]
class StrSpecialDealer:
    """Static helpers for cleaning HTML fragments before they are stored."""

    # Translation table that deletes space, tab, CR and LF in a single pass.
    _SEPARATORS = str.maketrans("", "", " \t\r\n")

    # Get the text inside the current tag
    @staticmethod
    def getReaction(stri):
        """Return the text between the first '>' and the last '<' of *stri*.

        Separators are stripped first (see ``simpleDeal``). When there is no
        '<' at all, everything after the first '>' is returned.

        NOTE(review): the end of this method was lost when the listing was
        extracted (the '<' literal and the return statement were swallowed);
        the slice end is restored as ``rfind('<')`` -- confirm against the
        author's original.
        """
        strs = StrSpecialDealer.simpleDeal(stri)
        end = strs.rfind('<')
        if end == -1:
            end = len(strs)
        return strs[strs.find('>') + 1:end]

    # Remove the basic separators
    @staticmethod
    def simpleDeal(stri):
        """Return ``str(stri)`` with every space, tab, CR and LF removed."""
        return str(stri).translate(StrSpecialDealer._SEPARATORS)

    # Delete all tag markup
    @staticmethod
    def deleteRe(stri):
        """Return ``str(stri)`` with every ``<...>`` span removed.

        An unmatched '<' (no '>' after it) is left in place together with the
        text that follows it, so the scan always terminates.

        NOTE(review): the loop body was partly lost in extraction; it is
        restored as "repeatedly cut from '<' to the next '>'" -- confirm.
        """
        strs = str(stri)
        pieces = []
        pos = 0
        while True:
            lt = strs.find('<', pos)
            # Search for '>' only after the '<' so a stray earlier '>' cannot
            # produce an empty slice and an endless loop.
            gt = strs.find('>', lt) if lt != -1 else -1
            if gt == -1:
                pieces.append(strs[pos:])
                break
            pieces.append(strs[pos:lt])
            pos = gt + 1
        return "".join(pieces)

    # Delete the sentences that contain a date
    @staticmethod
    def de_date(stri):
        """Drop every '。'-delimited sentence containing '年' or '月'.

        Each kept sentence is re-terminated with '。'; doubled '。。' produced
        by the trailing empty split item is collapsed to one.
        """
        kept = [
            st for st in str(stri).split("。")
            if "年" not in st and "月" not in st
        ]
        strs = "".join(st + "。" for st in kept)
        return strs.replace("。。", "。")

    # Get the sentences that come before the first dated sentence
    @staticmethod
    def ut_date(stri):
        """Keep the sentences preceding the first one containing '年' or '月'.

        Returns an empty string when the very first sentence is dated.
        """
        kept = []
        for st in str(stri).split("。"):
            if "年" in st or "月" in st:
                break
            kept.append(st + "。")
        return "".join(kept).replace("。。", "。")

    @staticmethod
    def beat(stri, num):
        """Remove the markers ``[0]`` .. ``[num-1]`` from ``str(stri)``."""
        strs = str(stri)
        for i in range(num):
            strs = strs.replace(f"[{i}]", "")
        return strs

# [ Object for crawling several pages in one browser session ]
class WebConnector:
    """Drive a single Firefox session against baike.baidu.com."""

    # Class-level defaults kept from the original listing.
    profile = ""
    sw = ""

    # ---[ Constructor ]
    def __init__(self):
        """Start Firefox and open the Baidu Baike home page."""
        self.profile = webdriver.Firefox()
        self.profile.get('https://baike.baidu.com/')

    # ---[ Release method ]
    def __close__(self):
        """Quit the browser session."""
        self.profile.quit()

    # Get the HTML source of the current page
    def getHTMLText(self):
        """Return the page source of whatever page the browser is showing."""
        return self.profile.page_source

    # Get the summary text of the current page
    def getFirstChanel(self):
        """Return the cleaned text of the first ``.lemma-summary`` element.

        The fragment is stripped of separators, of ``[0]``..``[19]`` markers
        and of tags, then cut at the first sentence containing '年' or '月'.
        Returns "" when the page has no ``.lemma-summary`` element.
        """
        index_sel = parsel.Selector(self.getHTMLText())
        summaries = index_sel.css('.lemma-summary').extract()
        if not summaries:
            return ""
        tpl = StrSpecialDealer.simpleDeal(summaries[0])
        tpl = StrSpecialDealer.beat(tpl, 20)
        tpl = StrSpecialDealer.deleteRe(tpl)
        return StrSpecialDealer.ut_date(tpl)

    def getMore(self, refers):
        """Type *refers* into the search box and submit the search.

        The box is cleared first because ``send_keys`` appends to whatever
        text is already in the input.
        """
        # find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
        # works on both old and new releases.
        from selenium.webdriver.common.by import By

        query_box = self.profile.find_element(By.ID, "query")
        query_box.clear()
        query_box.send_keys(refers)
        self.profile.find_element(By.ID, "search").click()
        # Fixed pause to let the result page load before it is read.
        time.sleep(1)

def main():
    """Look up two sample terms and print the summary found for each."""
    wc = WebConnector()
    try:
        for term in ("人工智能", "5G"):
            wc.getMore(term)
            print(wc.getFirstChanel())
    finally:
        # Always quit the browser, even when a lookup raises.
        wc.__close__()


if __name__ == "__main__":
    main()

test.py

嗯，然后我继续整合，将数据导入成文件批处理

对应代码：

1 importparsel2 from urllib importrequest3 importcodecs4 from selenium importwebdriver5 importtime6

# [ Cleaned-up record ]
class Info:
    """One hot word with its frequency and its explanatory text."""

    # ---[ Data section ]
    # Name of the hot word
    name = ""
    # Frequency
    num = 0
    # Explanation text (class-level default kept as in the original listing)
    more = 0

    # ---[ Method section ]
    # Constructor
    def __init__(self, name, num, more):
        """Store the word, its frequency and its explanation."""
        self.name = name
        self.num = num
        self.more = more

    def __toString__(self):
        """Return the record as one tab-separated line: name, num, more."""
        return f"{self.name}\t{self.num}\t{self.more}"

    def __toSql__(self, table):
        """Return an ``Insert`` statement for this record.

        Single quotes in the text fields are doubled so scraped text cannot
        terminate the string literal, and ``num`` is rendered through
        ``int()`` so only a number can reach the statement.

        Raises:
            ValueError: if ``num`` is not an integer or an integer string.

        NOTE(review): *table* is still interpolated verbatim; prefer a
        parameterized query at the call site when a DB driver is available.
        """
        name_sql = str(self.name).replace("'", "''")
        more_sql = str(self.more).replace("'", "''")
        return (
            f"Insert into {table} values "
            f"('{name_sql}',{int(self.num)},'{more_sql}');"
        )

# [ Batch pipeline: word list file -> web lookup -> output file ]
class FileToWebAndContent:
    """Read "name<TAB>count" lines, look each name up, write the results."""

    # Class-level defaults kept from the original listing.
    fileReaderPath = ""
    wc = ""
    sw = ""

    def __init__(self, r, w):
        """Open the browser and truncate the output file.

        Args:
            r: path of the input file to read.
            w: path of the output file to write.
        """
        self.fileReaderPath = r
        self.wc = WebConnector()
        self.sw = StringWriter(w)
        self.sw.makeFileNull()

    def __free__(self):
        """Close the browser session."""
        self.wc.__close__()

    def __deal__(self):
        """Process the input file line by line and append one record per hit.

        Raises:
            ValueError: if the second field of a line is not an integer.
        """
        # Read everything up front so the input file is closed before crawling.
        with open(self.fileReaderPath, mode='r', encoding='utf-8') as reader:
            lines = reader.readlines()

        for str_line in lines:
            gr = str_line.split("\t")
            if len(gr) < 2:
                # Blank or malformed line: nothing to look up.
                continue
            name_b = StrSpecialDealer.simpleDeal(gr[0])
            num_b = StrSpecialDealer.simpleDeal(gr[1])
            # Stops at the first count <= 2 -- presumably the input is sorted
            # by descending frequency; verify against the file's producer.
            if int(num_b) <= 2:
                break
            self.wc.getMore(name_b)
            more_b = self.wc.getFirstChanel()
            if more_b == "":
                continue
            info = Info(name_b, num_b, more_b)
            self.sw.write(info.__toString__())

# [ Collection of special string-handling helpers ]
class StrSpecialDealer:
    """Static helpers for cleaning HTML fragments before they are stored."""

    # Translation table that deletes space, tab, CR and LF in a single pass.
    _SEPARATORS = str.maketrans("", "", " \t\r\n")

    # Get the text inside the current tag
    @staticmethod
    def getReaction(stri):
        """Return the text between the first '>' and the last '<' of *stri*.

        Separators are stripped first (see ``simpleDeal``). When there is no
        '<' at all, everything after the first '>' is returned.

        NOTE(review): the end of this method was lost when the listing was
        extracted (the '<' literal and the return statement were swallowed);
        the slice end is restored as ``rfind('<')`` -- confirm against the
        author's original.
        """
        strs = StrSpecialDealer.simpleDeal(stri)
        end = strs.rfind('<')
        if end == -1:
            end = len(strs)
        return strs[strs.find('>') + 1:end]

    # Remove the basic separators
    @staticmethod
    def simpleDeal(stri):
        """Return ``str(stri)`` with every space, tab, CR and LF removed."""
        return str(stri).translate(StrSpecialDealer._SEPARATORS)

    # Delete all tag markup
    @staticmethod
    def deleteRe(stri):
        """Return ``str(stri)`` with every ``<...>`` span removed.

        An unmatched '<' (no '>' after it) is left in place together with the
        text that follows it, so the scan always terminates.

        NOTE(review): the loop body was partly lost in extraction; it is
        restored as "repeatedly cut from '<' to the next '>'" -- confirm.
        """
        strs = str(stri)
        pieces = []
        pos = 0
        while True:
            lt = strs.find('<', pos)
            # Search for '>' only after the '<' so a stray earlier '>' cannot
            # produce an empty slice and an endless loop.
            gt = strs.find('>', lt) if lt != -1 else -1
            if gt == -1:
                pieces.append(strs[pos:])
                break
            pieces.append(strs[pos:lt])
            pos = gt + 1
        return "".join(pieces)

    # Delete the sentences that contain a date
    @staticmethod
    def de_date(stri):
        """Drop every '。'-delimited sentence containing '年' or '月'.

        Each kept sentence is re-terminated with '。'; doubled '。。' produced
        by the trailing empty split item is collapsed to one.
        """
        kept = [
            st for st in str(stri).split("。")
            if "年" not in st and "月" not in st
        ]
        strs = "".join(st + "。" for st in kept)
        return strs.replace("。。", "。")

    # Get the sentences that come before the first dated sentence
    @staticmethod
    def ut_date(stri):
        """Keep the sentences preceding the first one containing '年' or '月'.

        Returns an empty string when the very first sentence is dated.
        """
        kept = []
        for st in str(stri).split("。"):
            if "年" in st or "月" in st:
                break
            kept.append(st + "。")
        return "".join(kept).replace("。。", "。")

    @staticmethod
    def beat(stri, num):
        """Remove the markers ``[0]`` .. ``[num-1]`` from ``str(stri)``."""
        strs = str(stri)
        for i in range(num):
            strs = strs.replace(f"[{i}]", "")
        return strs

# [ File-writing helpers ]
class StringWriter:
    """Append UTF-8 text lines to one file, opening it anew for each call."""

    # Class-level default kept from the original listing.
    filePath = ""

    def __init__(self, str):
        """Remember the target path.

        The parameter keeps its original name (it shadows the builtin only
        inside this method) so existing keyword callers keep working.
        """
        self.filePath = str

    def makeFileNull(self):
        """Create the file if needed and truncate it to zero length."""
        with open(self.filePath, "w", encoding="utf-8"):
            pass

    def write(self, stri):
        """Append *stri* followed by a single '\\n' to the file."""
        # newline="" disables newline translation, so the bytes written match
        # what the former codecs.open() call produced on every platform.
        with open(self.filePath, "a", encoding="utf-8", newline="") as f:
            f.write(stri + "\n")

# [ Object for crawling several pages in one browser session ]
class WebConnector:
    """Drive a single Firefox session against baike.baidu.com."""

    # Class-level defaults kept from the original listing.
    profile = ""
    sw = ""

    # ---[ Constructor ]
    def __init__(self):
        """Start Firefox and open the Baidu Baike home page."""
        self.profile = webdriver.Firefox()
        self.profile.get('https://baike.baidu.com/')

    # ---[ Release method ]
    def __close__(self):
        """Quit the browser session."""
        self.profile.quit()

    # Get the HTML source of the current page
    def getHTMLText(self):
        """Return the page source of whatever page the browser is showing."""
        return self.profile.page_source

    # Get the summary text of the current page
    def getFirstChanel(self):
        """Return the cleaned text of the first ``.lemma-summary`` element.

        The fragment is stripped of separators, of ``[0]``..``[19]`` markers
        and of tags, then cut at the first sentence containing '年' or '月'.
        Returns "" when the page has no such element or when extraction
        fails, so a batch run can skip the entry and continue.
        """
        try:
            index_sel = parsel.Selector(self.getHTMLText())
            summaries = index_sel.css('.lemma-summary').extract()
            if not summaries:
                return ""
            tpl = StrSpecialDealer.simpleDeal(summaries[0])
            tpl = StrSpecialDealer.beat(tpl, 20)
            tpl = StrSpecialDealer.deleteRe(tpl)
            return StrSpecialDealer.ut_date(tpl)
        except Exception:
            # Best-effort by design, but narrower than a bare except so that
            # Ctrl-C (KeyboardInterrupt) and SystemExit still stop the crawl.
            return ""

    def getMore(self, refers):
        """Type *refers* into the search box and submit the search.

        The box is cleared first because ``send_keys`` appends to whatever
        text is already in the input.
        """
        # find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
        # works on both old and new releases.
        from selenium.webdriver.common.by import By

        query_box = self.profile.find_element(By.ID, "query")
        query_box.clear()
        query_box.send_keys(refers)
        self.profile.find_element(By.ID, "search").click()
        # Fixed pause to let the result page load before it is read.
        time.sleep(1)

def main():
    """Run the batch lookup from output.txt into moreinfo.txt."""
    ftwac = FileToWebAndContent("../testFile/rc/output.txt", "../testFile/rc/moreinfo.txt")
    try:
        ftwac.__deal__()
    finally:
        # Always close the browser, even when the batch run raises.
        ftwac.__free__()


if __name__ == "__main__":
    main()

MoreInfo.py

对应得到文件截图：

weixin_39890633

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python热词关系图_Python 爬取热词并进行分类数据分析-[拓扑数据]

日期：2020.01.29博客期：137星期三【本博客的代码如若要使用，请在下方评论区留言，之后再用(就是跟我说一声)】所有相关跳转：a.【简单准备】c.【拓扑数据】(本期博客)d.【数据修复】i . 【App制作】j . 【安全性改造】嗯，先声明一下 “拓扑数据”的意思，应老师需求，我们需要将热词的解释、引用等数据从百科网站中爬取下来，之后将统一的热词数据进行文件处理，组合成新的数据表，然后可以...
复制链接

扫一扫