Scraping the data
huputitle_spiders.py
#coding:utf-8
import scrapy
from huputitle.items import HuputitleItem


class hupuSpider(scrapy.Spider):
    name = 'huputitle'
    allowed_domains = ["bbs.hupu.com"]
    start_urls = ["https://bbs.hupu.com/bxj"]

    def parse(self, response):
        item = HuputitleItem()
        item['titles'] = response.xpath('//a[@id=""]/text()').extract()  # extract the post titles
        # print 'titles', item['titles']
        yield item
        # follow the "next page" link and parse it with the same callback
        next_href = response.xpath('//a[@id="j_next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request("https://bbs.hupu.com" + next_href, callback=self.parse)
items.py
# -*- coding: utf-8 -*-
import scrapy


class HuputitleItem(scrapy.Item):
    # define the fields for your item here like:
    titles = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import os
import urllib
from huputitle import settings
import sys

reload(sys)
sys.setdefaultencoding("utf-8")


class HuputitlePipeline(object):
    def process_item(self, item, spider):
        # append every scraped title to foo.txt, one per line
        for title in item['titles']:
            # print 'title', title
            fo = open("foo.txt", "a")
            fo.write("".join(title) + "\r\n")
            fo.close()
        return item
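The pipeline above reopens foo.txt once per title. As a minimal sketch (not the original code), the same pipeline could keep the file open for the whole crawl by using Scrapy's open_spider/close_spider hooks:

# sketch: open the output file once per crawl instead of once per title
class HuputitlePipeline(object):
    def open_spider(self, spider):
        self.fo = open("foo.txt", "a")

    def close_spider(self, spider):
        self.fo.close()

    def process_item(self, item, spider):
        for title in item['titles']:
            self.fo.write("".join(title) + "\r\n")
        return item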
settings.py
BOT_NAME = 'huputitle'
SPIDER_MODULES = ['huputitle.spiders']
NEWSPIDER_MODULE = 'huputitle.spiders'
ITEM_PIPELINES = {
    'huputitle.pipelines.HuputitlePipeline': 1,
}
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
In the end the crawl covered 100 pages and collected more than 20,000 titles.
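For reference, a crawl like this can be started with `scrapy crawl huputitle` from the project root, or programmatically with CrawlerProcess. Below is a minimal runner sketch; the spider module path is assumed from the file names above:

# run_huputitle.py -- hypothetical runner, assuming the project layout shown above
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from huputitle.spiders.huputitle_spiders import hupuSpider

process = CrawlerProcess(get_project_settings())
process.crawl(hupuSpider)
process.start()  # blocks until the crawl finishes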
Segmenting the titles and counting the words
Here I used the jieba library for word segmentation.
hupudivide.py
#encoding=utf-8
import jieba
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

fo = open("hupu.txt", "r")
fi = open("hupudi.txt", "w")
lines = fo.readlines()
for line in lines:
    # search-engine style segmentation: also emits finer-grained cuts of long words
    seg_list = jieba.cut_for_search(line)
    fi.write(" \n".join(seg_list))
fo.close()
fi.close()
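To see what cut_for_search actually produces compared with the default cut, here is a tiny sketch; the sample title is made up:

# -*- coding: utf-8 -*-
# sketch: compare jieba's default cut with cut_for_search on a made-up title
import jieba

title = u"詹姆斯今天的表现怎么样"               # hypothetical sample title
print("/".join(jieba.cut(title)))              # default, coarser segmentation
print("/".join(jieba.cut_for_search(title)))   # finer cuts, also emits sub-words of long words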
Segmentation produced roughly 170,000 words. The next step is to count how often each one appears.
huPuCounter.py
#encoding=utf-8
import jieba
import jieba.analyse
import time
from collections import Counter
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

fo = open("hupudi.txt", "r")
fi = open("hupunum.txt", "w")
fl = open("hupunumword.txt", "w")
f = open("hupuword.txt", "w")

# count how many times each segmented word appears
lines = fo.readlines()
d = {}
for line in lines:
    if line not in d:
        d[line] = 1
    else:
        d[line] = d[line] + 1

# sort by frequency, highest first
d = sorted(d.items(), key=lambda item: item[1], reverse=True)

for k in d:
    fi.write("%s%d\n" % (k[0][:-1].encode('utf-8'), k[1]))
    # keep only words of two or more Chinese characters (>= 6 bytes in UTF-8)
    if len(k[0][:-1].encode('utf-8')) >= 6:
        fl.write("%s%d\n" % (k[0][:-1].encode('utf-8'), k[1]))
        f.write("%s" % (k[0][:-1].encode('utf-8')))
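The script imports collections.Counter but builds the frequency table by hand; as a sketch (same input file assumed), the counting and sorting could be done with Counter directly:

# sketch: the manual dict above is equivalent to collections.Counter
from collections import Counter

with open("hupudi.txt") as fo:
    counts = Counter(line.rstrip("\n") for line in fo)

# most_common() returns (word, count) pairs sorted by count, highest first
for word, num in counts.most_common():
    print("%s %d" % (word, num))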
Here I looked at how the totals split between single-character words and words of two or more characters (the >= 6 check works because each Chinese character takes 3 bytes in UTF-8); the distribution is shown in the figure.
Generating the word cloud and other charts
makeHupuCloud.py
#encoding=utf-8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

text_from_file_with_apath = open('foo.txt').read()
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
wl_space_split = " ".join(wordlist_after_jieba)

# use the Hupu logo as a mask so the cloud takes its shape
background_image = plt.imread('huputag.jpg')
my_wordcloud = WordCloud(background_color='white',
                         mask=background_image).generate(wl_space_split)

plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
The word cloud here was generated with Python's wordcloud library; the mask image is the Hupu logo.
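One practical note: wordcloud's bundled default font usually cannot render Chinese characters (they show up as boxes), so it is common to pass a font_path pointing at a Chinese-capable font. A sketch, with a placeholder path:

# sketch: give WordCloud a Chinese-capable font so the words render correctly
my_wordcloud = WordCloud(background_color='white',
                         mask=background_image,
                         font_path='/path/to/a/chinese/font.ttf'  # hypothetical path
                         ).generate(wl_space_split)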
The remaining charts were generated from the part-of-speech tags that jieba assigns during segmentation.
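The part-of-speech tagging comes from jieba's posseg module. As a minimal sketch (the input file name follows the earlier scripts), counting how often each tag appears looks like this:

# sketch: tally jieba part-of-speech tags for the scraped titles
import jieba.posseg as pseg
from collections import Counter

text = open("foo.txt").read()
flags = Counter(pair.flag for pair in pseg.cut(text))  # each pair has .word and .flag

# print the most common POS tags, e.g. 'n' (noun), 'v' (verb)
for flag, num in flags.most_common(10):
    print("%s %d" % (flag, num))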