做情感文本分类的时候,简单地将文本中的emoji表情换成了文字,勉强提升了一下准确率。
英文释义:http://www.oicqzone.com/tool/emoji/
中文释义:http://www.megaemoji.com/cn/emoji/
其实吧,直接三行代码就可以搞定~
三行神龙代码在结尾,先放一下正常的爬虫。
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
import pprint
from lxml import etree
def get_table_from_html(html):
    """
    Parse an HTML document and extract every table found in it.

    :param html: HTML markup, handed to ``etree.HTML``.
    :return: list with one ``get_table`` result per ``<table>`` element, in
        the order returned by the ``//table`` XPath query; an empty list
        when the parser produced no root element.
    """
    tree = etree.HTML(html)
    if tree is None:
        # Defensive: without a root element there is nothing to query, and
        # calling .xpath on None would raise AttributeError.
        return []
    # Look up every <table> tag anywhere in the document.
    return [get_table(table) for table in tree.xpath("//table")]
def get_table(table_ele):
    """
    Extract the header row and the data rows of one table element.

    :param table_ele: element exposing ``xpath`` (a ``<table>`` node).
    :return: dict with the keys ``'title'`` (cell texts of the first row)
        and ``'data'`` (one list of cell texts per remaining row). Both
        values are empty lists when the table contains no ``<tr>`` rows.
    """
    tr_lst = table_ele.xpath(".//tr")
    if not tr_lst:
        # A table without rows has neither header nor data; indexing
        # tr_lst[0] below would raise IndexError.
        return {'title': [], 'data': []}
    # The first row is treated as the header, everything after it as data.
    header_row, body_rows = tr_lst[0], tr_lst[1:]
    return {
        'title': get_title(header_row),
        'data': get_data(body_rows),
    }
def get_title(tr_ele):
    """
    Get the header cell texts of a row.

    Header cells may be written with ``<th>`` tags or with ``<td>`` tags, so
    ``<th>`` is tried first and ``<td>`` is used only when no ``<th>`` text
    list comes back.

    :param tr_ele: the ``<tr>`` element holding the header cells.
    :return: list of cell texts as produced by ``get_tr_data_by_tag``.
    """
    # `or` falls through to the <td> lookup only when the <th> list is empty.
    return get_tr_data_by_tag(tr_ele, 'th') or get_tr_data_by_tag(tr_ele, 'td')
def get_data(tr_lst):
    """
    Get the cell texts of every data row.

    :param tr_lst: iterable of ``<tr>`` elements.
    :return: list containing, for each row, the list of its ``<td>`` texts
        as produced by ``get_tr_data_by_tag``.
    """
    return [get_tr_data_by_tag(tr, 'td') for tr in tr_lst]
def get_tr_data_by_tag(tr, tag):
    """
    Get the texts of all cells with a given tag inside one row.

    :param tr: row element exposing ``xpath``.
    :param tag: tag name of the cells to collect, e.g. ``'td'`` or ``'th'``.
    :return: list with the whitespace-stripped ``string(.)`` value of each
        matching descendant node, in query order.
    """
    nodes = tr.xpath(".//{tag}".format(tag=tag))
    # string(.) yields the node's full text content; strip the surrounding
    # whitespace of each value.
    return [node.xpath('string(.)').strip() for node in nodes]
def get_html(url, timeout=10):
    """
    Download a page and return its body decoded as UTF-8.

    The HTTP status code is not checked; whatever body the server sends is
    returned.

    :param url: address of the page to fetch.
    :param timeout: value passed to ``requests.get`` as ``timeout``. Without
        one, a stalled server would block the call indefinitely.
    :return: the response text.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding instead of relying on the detected encoding.
    res.encoding = 'utf-8'
    return res.text
def run(url):
    """
    Scrape the first table of a page and keep two of its columns.

    :param url: address of the page to scrape.
    :return: list of ``[row[7], row[1]]`` pairs, one per data row of the
        first table (presumably the textual description and the emoji;
        confirm against the page layout). Rows with fewer than eight cells
        are skipped, and an empty list is returned when the page contains
        no table.
    """
    html = get_html(url)
    table_lst = get_table_from_html(html)
    if not table_lst:
        # No <table> on the page: nothing to extract.
        return []
    # The page structure is fairly simple: only the first table is used.
    rows = table_lst[0]['data']
    # Rows too short to have a column 7 would raise IndexError, so they are
    # left out.
    return [[row[7], row[1]] for row in rows if len(row) > 7]
if __name__ == '__main__':
    # Scrape the emoji reference page and print the extracted pairs.
    url = 'http://www.oicqzone.com/tool/emoji/'
    data = run(url)
    # NOTE: run() returns a plain list, which has no to_csv(); wrap it in a
    # DataFrame first if CSV output is wanted.
    print(data)
其实吧,直接三行代码就可以搞定~
import pandas as pd
# Shortcut: let pandas fetch the page and parse its tables in one call,
# then keep the first table without its first row.
url = 'http://www.oicqzone.com/tool/emoji/'
data = pd.read_html(url)[0].iloc[1:]
直接获得网站上的表情,刺激~难怪都说Python是爬虫的最佳语言。