Screenshot of the crawled page
Word cloud results (title)
Code
Data scraping
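The script below first reads the total page count from the pagination bar of the Qunar POI page, builds the URL of every comment-list page, collects the link of each individual comment, and then saves each comment's title and body to ./data/data.txt as one @@title@@/@@comment@@ record per comment. It also writes the concatenated titles and comments to ./data/wc/title.txt and ./data/wc/comment.txt for the word-cloud step.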
import os
import urllib.request

from lxml import etree


def get_page(url):
    # Fetch the raw HTML bytes of a page.
    page = urllib.request.urlopen(url)
    html = page.read()
    return html


def get_data_comment(html):
    # Extract the title and full comment text from a single comment page.
    selector = etree.HTML(html)
    str_title = selector.xpath('//div[@class="comment_title"]/h2/text()')[0]
    list_comment = selector.xpath('//div[@class="comment_content"]/p/text()')
    str_comment = ''
    for comment in list_comment:
        str_comment += comment + ' '
    print(str_title)
    return str_title.replace('\n', ''), str_comment.replace('\n', '')


def write_data_wc(item_name, write_str):
    # Save the concatenated text used later to build the word clouds.
    print(write_str)
    path_file = "./data/wc/" + item_name + ".txt"
    with open(path_file, 'w', encoding='utf8') as file:
        file.write(write_str)


def write_data(write_str):
    # Append one record to the raw data file.
    path_file = "./data/data.txt"
    with open(path_file, 'a', encoding='utf8') as file:
        file.write(write_str)


def craw(root_url):
    os.makedirs("./data/wc", exist_ok=True)  # make sure the output directories exist
    path_file = "./data/data.txt"
    if os.path.exists(path_file):
        os.remove(path_file)  # start from a clean data file

    # Read the total number of list pages from the pagination bar.
    html = get_page(root_url)
    selector = etree.HTML(html)
    str_total_num = selector.xpath('//div[@class="b_paging"]/a[last()-1]/text()')[0]
    total_num = int(str_total_num)

    # Build the URL of every comment-list page.
    url_front = 'http://travel.qunar.com/p-oi5420182-beijingnanzhan-1-'
    list_url = []
    for i in range(1, total_num + 1):
        list_url.append(url_front + str(i))

    # Collect the link of every individual comment from each list page.
    list_url_comment_page = []
    for url in list_url:
        html = get_page(url)
        selector = etree.HTML(html)
        list_url_comment_per_page = selector.xpath(
            '//ul[@id="comment_box"]/li/div[@class="e_comment_main"]//div[@class="e_comment_title"]/a/@href')
        for url_comment in list_url_comment_per_page:
            list_url_comment_page.append(url_comment)

    # Visit every comment page, store each record, and accumulate the texts.
    str_title = ''
    str_comment = ''
    for url in list_url_comment_page:
        print(url)
        html = get_page(url)
        str_title_pre, str_comment_pre = get_data_comment(html)
        str_write = '@@title@@' + '\t\t' + str_title_pre + '\n' \
                    + '@@comment@@' + '\t\t' + str_comment_pre + '\n' \
                    + '--------------------------------------------' + '\n'
        write_data(str_write)
        str_title += str_title_pre + ' '
        str_comment += str_comment_pre + ' '

    write_data_wc('title', str_title.replace('\n', '').replace(' ', ''))
    write_data_wc('comment', str_comment.replace('\n', ''))


if __name__ == '__main__':
    craw(root_url='http://travel.qunar.com/p-oi5420182-beijingnanzhan')
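Note that a bare urllib.request.urlopen call sends no browser headers, and some sites reject or throttle such requests. If the crawl fails, a variant of get_page that sets a User-Agent and pauses between requests may help. This is a minimal sketch, not part of the original script; the header string and delay value are illustrative:

import time
import urllib.request

def get_page_polite(url, delay=1.0):
    # Illustrative variant of get_page: send a browser-like User-Agent
    # and pause between requests to avoid hammering the server.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read()
    time.sleep(delay)
    return html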
Word cloud
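The word-cloud step segments the scraped text with jieba in full mode (cut_all=True), which emits every dictionary word it can find, overlapping tokens included, then counts keyword frequencies and draws them onto a mask image. The two segmentation modes differ as follows (the sample sentence is illustrative); the full script is below.

import jieba

# Full mode: every dictionary word found in the text, overlapping allowed.
print('/'.join(jieba.cut('北京南站交通很方便', cut_all=True)))
# Precise mode: a single non-overlapping segmentation.
print('/'.join(jieba.cut('北京南站交通很方便', cut_all=False)))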
from os import path

import jieba
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS


def load_txt(item_name):
    # Read the text saved by the scraper for this item ('title' or 'comment').
    with open('./data/wc/' + item_name + '.txt', 'r', encoding='utf8') as file_item:
        str_item = file_item.read()
    return str_item


def fenci(str_text):
    # Segment Chinese text with jieba in full mode.
    seg_list = list(jieba.cut(str_text, cut_all=True))
    return seg_list


def count_keywords(item_name):
    # Count how often each keyword appears; skip single-character tokens,
    # which also filters out empty strings.
    str_item = load_txt(item_name)
    list_keywords = fenci(str_item)
    dict_keywords_item = {}
    for keyword in list_keywords:
        if len(keyword) > 1:
            if keyword not in dict_keywords_item:
                dict_keywords_item[keyword] = 1
            else:
                dict_keywords_item[keyword] += 1
    return dict_keywords_item


def wordcloud(item_name, mask_img):
    # Build a word cloud in the shape of the mask image and save it to ./img/.
    dict_keywords_item = count_keywords(item_name)
    image = Image.open("./img/mask/" + mask_img)
    graph = np.array(image)
    wc = WordCloud(font_path='./fonts/MSYH.TTC', background_color="black",
                   max_words=50, mask=graph, stopwords=set(STOPWORDS))
    wc.generate_from_frequencies(dict_keywords_item)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.figure()
    plt.imshow(graph, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")
    wc.to_file(path.join("./img", item_name + '.png'))
    plt.show()  # display the cloud and the mask


if __name__ == '__main__':
    wordcloud('title', 'mask.png')
    wordcloud('comment', 'mask.png')
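One caveat: WordCloud's stopwords argument is only applied by generate()/process_text(); generate_from_frequencies() uses the frequency dict as-is, so the STOPWORDS set passed above has no effect here (and it only contains English words anyway). To drop high-frequency filler words, filter the dict before generating. A minimal sketch; the stopword set and the helper name filter_keywords are illustrative:

# Illustrative Chinese stopword filtering; extend the set as needed.
STOPWORDS_ZH = {'我们', '就是', '可以', '没有'}

def filter_keywords(dict_keywords):
    # Keep only the keywords that are not in the stopword set.
    return {word: freq for word, freq in dict_keywords.items()
            if word not in STOPWORDS_ZH}

# Usage: wc.generate_from_frequencies(filter_keywords(dict_keywords_item))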