一、LDA实现和库函数总结
LDA-C:David Blei,C实现,VBEM参数估计:
http://www.cs.princeton.edu/~blei/lda-c/index.html
GibbsLDA++/JGibbLDA:C/C++实现/Java实现:
http://gibbslda.sourceforge.net/
http://jgibblda.sourceforge.net
Scikit-learn: sklearn.decomposition.LatentDirichletAllocation/OnlineVB
常用正则表达式:
二、样例
链家数据获取:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import sys
import csv
# Python 2 only: reload(sys) re-exposes setdefaultencoding (removed by
# site.py at startup) so the implicit str<->unicode codec can be switched
# from ASCII to UTF-8, letting the Chinese literals below round-trip
# without UnicodeDecodeError. This has no Python 3 equivalent.
reload(sys)
sys.setdefaultencoding('utf-8')
def not_empty(s):
    """Predicate for filter(): truthy only for strings containing non-whitespace.

    Fix: the parameter was named ``str``, shadowing the builtin; renamed to
    ``s`` (callers pass it positionally via ``filter``, so the interface is
    unchanged).

    :param s: string to test (``None`` is tolerated and treated as empty)
    :return: a falsy value (``None``/``''``) for empty/blank input, otherwise
             a truthy stripped string — exactly what ``filter`` needs.
    """
    return s and s.strip()
if __name__ == '__main__':
    # Scrape second-hand housing listings from bj.lianjia.com, district by
    # district and page by page, and write one CSV row per listing.
    # Python 2 code: relies on `print` statements, `unicode`, and
    # list-returning `filter`. (Indentation reconstructed from the logic —
    # the pasted source had lost it.)
    url_main = 'http://bj.lianjia.com'
    f = open(u'北京二手房.csv', 'wb')
    f.write(unicode('\xEF\xBB\xBF', 'utf-8'))  # UTF-8 BOM so spreadsheet apps detect the encoding
    writer = csv.writer(f)
    # Header: district, community, layout, area, price (10k CNY),
    # unit price (CNY/sqm), tax status, orientation, decoration,
    # elevator, floor, construction year, building type.
    writer.writerow(['区域', '小区名称', '户型', '面积', '价格(万)', '单价(元/平米)',
                     '性质', '朝向', '装修', '是否有电梯', '楼层', '建筑年代', '楼型'])
    res = requests.get('http://bj.lianjia.com/ershoufang')
    # Normalise the payload to UTF-8 regardless of the declared encoding.
    res = res.text.encode(res.encoding).decode('utf-8')
    soup = BeautifulSoup(res, 'html.parser')
    # print soup.prettify()
    districts = soup.find(name='div', attrs={'data-role':'ershoufang'})  # <div data-role="ershoufang">
    # soup.select()
    # One <a> per district inside the district selector div.
    for district in districts.find_all(name='a'):
        print district['title']
        district_name = district.text  # e.g. 东城, 西城, 朝阳, 海淀 ...
        url = '%s%s' % (url_main, district['href'])
        # print url
        res = requests.get(url)
        res = res.text.encode(res.encoding).decode('utf-8')
        soup = BeautifulSoup(res,'html.parser')
        # print soup.prettify()
        page = soup.find('div', {'class':'page-box house-lst-page-box'})
        if not page:  # districts with no listings (e.g. Pinggu) have no pager — skip
            continue
        # NOTE(review): eval() on scraped page content is unsafe; the
        # page-data attribute appears to hold a dict literal like
        # {"totalPage": N, ...} — json.loads would be safer. Left as-is.
        total_pages = dict(eval(page['page-data']))['totalPage']  # total number of result pages
        # print total_pages
        for j in range(1, total_pages+1):
            url_page = '%spg%d/' % (url, j)  # pagination pattern: <district-url>pgN/
            res = requests.get(url_page)
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            # print soup.prettify()
            sells = soup.find(name='ul', attrs={'class':'sellListContent', 'log-mod':'list'})
            if not sells:
                continue
            # <a class="title" data-bl="list" data-el="ershoufang" data-log_index="1" href="XX" target="_blank">
            titles = soup.find_all(name='a', attrs={'class':'title', 'data-bl':'list', 'data-el':'ershoufang'})
            # <a data-el="region" data-log_index="1" href="X" target="_blank">
            regions = sells.find_all(name='a', attrs={'data-el':'region'})
            infos = sells.find_all(name='div', class_='houseInfo')  # <div class="houseInfo">
            infos2 = sells.find_all(name='div', class_='positionInfo')  # <div class="positionInfo">
            prices = sells.find_all(name='div', class_='totalPrice')  # <div class="totalPrice">
            unit_prices = sells.find_all(name='div', class_='unitPrice')  # <div class="unitPrice" data-hid="X" data-price="X" data-rid="X">
            subways = sells.find_all(name='span', class_='subway')  # <span class="subway">
            taxs = sells.find_all(name='span', class_='taxfree')  # <span class="taxfree">
            # The per-field lists can differ in length (not every listing has
            # every field), so iterate up to the longest and index defensively
            # instead of zip()-ing them.
            N = max(len(titles), len(regions), len(prices), len(unit_prices), len(subways), len(taxs), len(infos), len(infos2))
            # for title, region, price, unit_price, subway, tax, info, info2 in zip(titles, regions, prices, unit_prices, subways, taxs, infos, infos2):
            for i in range(N):
                # Default every derived field to None for listings that lack it.
                room_type = area = orientation = decoration = elevator = floor = year = slab_tower = None
                title = titles[i] if len(titles) > i else None
                region = regions[i] if len(regions) > i else None
                price = prices[i] if len(prices) > i else None
                unit_price = unit_prices[i] if len(unit_prices) > i else None
                subway = subways[i] if len(subways) > i else None
                tax = taxs[i] if len(taxs) > i else None
                info = infos[i] if len(infos) > i else None
                info2 = infos2[i] if len(infos2) > i else None
                if title:
                    print 'Title: ', title.text
                if region:
                    region = region.text
                if price:
                    price = price.text
                    price = price[:price.find('万')]  # strip the trailing 万 (10k CNY) unit
                if unit_price:
                    unit_price = unit_price.span.text.strip()
                    unit_price = unit_price[:unit_price.find('元/平米')]  # strip the CNY/sqm unit
                    if unit_price.find('单价') != -1:  # drop an optional leading '单价' label
                        unit_price = unit_price[2:]
                if subway:
                    subway = subway.text.strip()
                if tax:
                    tax = tax.text.strip()
                if info:
                    # houseInfo is '|'-separated: [1] layout, [2] area,
                    # [3] orientation, [4] decoration, [5] elevator.
                    info = info.text.split('|')
                    room_type = info[1].strip()  # layout, e.g. "2室1厅"
                    area = info[2].strip()  # floor area
                    area = area[:area.find('平米')]  # strip the 平米 (sqm) unit
                    orientation = info[3].strip().replace(' ', '')  # facing direction
                    decoration = '-'
                    if len(info) > 4:  # missing for parking-space listings
                        decoration = info[4].strip()  # decoration level: 简装/中装/精装/豪装/其他
                    elevator = '无'
                    if len(info) > 5:
                        elevator = info[5].strip()  # elevator: 有 (yes) / 无 (no)
                if info2:
                    # Python 2: filter() returns a list, so the indexing
                    # below works; in Python 3 this would need list(filter(...)).
                    info2 = filter(not_empty, info2.text.split(' '))
                    floor = info2[0].strip()
                    # Second token looks like '<year>年建<building-type>'.
                    info2 = info2[1]
                    year = info2[:info2.find('年')]
                    slab_tower = info2[info2.find('建')+1:]
                print district_name, region, room_type, area, price, unit_price, tax, orientation, decoration, elevator, floor, year, slab_tower
                writer.writerow([district_name, region, room_type, area, price, unit_price, tax, orientation, decoration, elevator, floor, year, slab_tower])
                # break
            # break
        # break
    f.close()
LDA样例:
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from gensim import corpora, models, similarities
from pprint import pprint
import time
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def load_stopword(path='stopword.txt'):
    """Load the stop-word list, one word per line.

    Improvements over the original: the file is opened with a ``with``
    statement so the handle is closed even if reading raises, and the
    hard-coded filename is generalized into a ``path`` parameter whose
    default preserves the original behavior.

    :param path: stop-word file, one entry per line (default 'stopword.txt').
    :return: list of lines stripped of surrounding whitespace (blank lines
             are kept as empty strings, exactly as before).
    """
    with open(path) as f_stop:
        return [line.strip() for line in f_stop]
if __name__ == '__main__':
    # End-to-end LDA topic-modelling demo with gensim (Python 2):
    # stop words -> corpus -> Dictionary -> bag-of-words -> TF-IDF ->
    # LdaModel -> inspect per-document topics and per-topic terms.
    # (Indentation reconstructed from the logic — the pasted source had
    # lost it.)
    print '初始化停止词列表 --'
    t_start = time.time()
    stop_words = load_stopword()
    print '开始读入语料数据 -- '
    f = open('news.dat')  # alternative small test corpus: LDA_test.txt
    # One document per line: lower-cased, whitespace-tokenised,
    # stop words removed.
    texts = [[word for word in line.strip().lower().split() if word not in stop_words] for line in f]
    # texts = [line.strip().split() for line in f]
    print '读入语料数据完成,用时%.3f秒' % (time.time() - t_start)
    f.close()
    M = len(texts)  # number of documents
    print '文本数目:%d个' % M
    # pprint(texts)
    print '正在建立词典 --'
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)  # vocabulary size
    print u'词的个数:', V
    print '正在计算文本向量 --'
    corpus = [dictionary.doc2bow(text) for text in texts]  # sparse (term_id, count) vectors
    print '正在计算文档TF-IDF --'
    t_start = time.time()
    # Fit a TF-IDF model on the corpus, then transform that same corpus.
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print '建立文档TF-IDF完成,用时%.3f秒' % (time.time() - t_start)
    print 'LDA模型拟合推断 --'
    num_topics = 10
    t_start = time.time()
    # Symmetric Dirichlet priors alpha/eta = 0.01; online updates in
    # chunks of 100 documents, single pass.
    # NOTE(review): gensim's LDA is normally trained on raw BoW counts,
    # not TF-IDF weights — kept as the original tutorial had it.
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha=0.01, eta=0.01, minimum_probability=0.001,
                          update_every = 1, chunksize = 100, passes = 1)
    print 'LDA模型完成,训练时间为\t%.3f秒' % (time.time() - t_start)
    # # Topic distribution of every document:
    # doc_topic = [a for a in lda[corpus_tfidf]]
    # print 'Document-Topic:\n'
    # pprint(doc_topic)
    # Print the topic distribution of 10 randomly chosen documents.
    num_show_topic = 10  # how many top topics to show per document
    print '10个文档的主题分布:'
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distributions of all documents
    idx = np.arange(M)
    np.random.shuffle(idx)
    idx = idx[:10]
    for i in idx:
        topic = np.array(doc_topics[i])  # rows of (topic_id, probability)
        print 'topic = \t', topic
        topic_distribute = np.array(topic[:, 1])
        # print topic_distribute
        # Indices of the num_show_topic most probable topics, descending.
        topic_idx = topic_distribute.argsort()[:-num_show_topic-1:-1]
        print ('第%d个文档的前%d个主题:' % (i, num_show_topic)), topic_idx
        print topic_distribute[topic_idx]
    num_show_term = 7  # how many terms to show per topic
    print '每个主题的词分布:'
    for topic_id in range(num_topics):
        print '主题#%d:\t' % topic_id
        term_distribute_all = lda.get_topic_terms(topicid=topic_id)  # (term_id, probability) pairs
        term_distribute = term_distribute_all[:num_show_term]
        term_distribute = np.array(term_distribute)
        term_id = term_distribute[:, 0].astype(np.int)
        print '词:\t',  # trailing comma: keep printing on the same line (Python 2)
        for t in term_id:
            print dictionary.id2token[t],
        print
        # print '\n概率:\t', term_distribute[:, 1]