# -*- coding:utf-8 -*-
import os
import sys
import json
import time
import datetime
import xlsxwriter
from utils.twitter_client import ClientService
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
client_service = ClientService()
class TwitterService(object):
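# Read-through cache: each read_* method below first looks for a previously saved
# response under static/<screen_name>/<kind>/ and returns it when found; otherwise
# it calls the Twitter REST API v1.1, writes the raw JSON to that directory
# (file name suffixed with a Unix timestamp) and returns it.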
# Followers
@staticmethod
def read_followers_by_screen_name(screen_name, cursor='-1', count=200):
top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
parent_dir = os.path.join(top_parent_dir, screen_name, 'followers')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
dest_file_list = os.listdir(parent_dir)
if dest_file_list is not None and len(dest_file_list) > 0:
dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
with open(os.path.join(parent_dir, dest_file), 'r') as rf:
return rf.readline()
url = 'https://api.twitter.com/1.1/followers/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
client = client_service.get_client()
try:
resp, content = client.request(url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
resp, content = client.request(url, method='GET', body='', headers=None)
dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name + '_' + str(cursor)))
with open(dest_file, 'w') as wf:
wf.write(str(content) + '\n')
return content
# Following (friends)
@staticmethod
def read_followings_by_screen_name(screen_name, cursor='-1', count=200):
top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
parent_dir = os.path.join(top_parent_dir, screen_name, 'followings')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
dest_file_list = os.listdir(parent_dir)
if dest_file_list is not None and len(dest_file_list) > 0:
dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
with open(os.path.join(parent_dir, dest_file), 'r') as rf:
return rf.readline()
url = 'https://api.twitter.com/1.1/friends/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
client = client_service.get_client()
try:
resp, content = client.request(url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
resp, content = client.request(url, method='GET', body='', headers=None)
dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name + '_' + str(cursor)))
with open(dest_file, 'w') as wf:
wf.write(str(content) + '\n')
return content
# Tweets (user timeline, by screen name)
@staticmethod
def read_user_timeline_by_screen_name(screen_name, max_id=None, count=200):
top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
parent_dir = os.path.join(top_parent_dir, screen_name, 'timeline')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
dest_file_list = os.listdir(parent_dir)
if dest_file_list is not None and len(dest_file_list) > 0:
dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
with open(os.path.join(parent_dir, dest_file), 'r') as rf:
return rf.readline()
url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$screen_name&count=$count&exclude_replies=true'
if max_id is not None:
url = url + '&max_id=$max_id'
url = url.replace('$screen_name', screen_name).replace('$count', str(count)).replace('$max_id', str(max_id))
client = client_service.get_client()
try:
resp, content = client.request(url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
resp, content = client.request(url, method='GET', body='', headers=None)
if content is not None and len(content) > 2: # only cache a non-empty timeline (an empty response is '[]')
dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(
screen_name + '_' + ('-1' if max_id is None else str(max_id))))
with open(dest_file, 'w') as wf:
wf.write(str(content) + '\n')
return content
# Tweets (user timeline, by user id; no disk cache)
@staticmethod
def read_user_timeline_by_user_id(user_id, max_id=None, count=200):
url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?user_id=$user_id&count=$count&exclude_replies=true'
if max_id is not None:
url = url + '&max_id=$max_id'
url = url.replace('$user_id', str(user_id)).replace('$count', str(count)).replace('$max_id', str(max_id))
client = client_service.get_client()
try:
resp, content = client.request(url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
return content
# User profile info
@staticmethod
def read_user_info_by_screen_name(screen_name):
top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
parent_dir = os.path.join(top_parent_dir, screen_name, 'profile')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
TwitterService.read_followers_by_screen_name(screen_name) # also fetch and cache this user's followers
dest_file_list = os.listdir(parent_dir)
if dest_file_list is not None and len(dest_file_list) > 0:
dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
with open(os.path.join(parent_dir, dest_file), 'r') as rf:
return rf.readline()
url = 'https://api.twitter.com/1.1/users/show.json?screen_name=$screen_name'
url = url.replace('$screen_name', screen_name)
client = client_service.get_client()
try:
resp, content = client.request(url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
resp, content = client.request(url, method='GET', body='', headers=None)
dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name))
with open(dest_file, 'w') as wf:
wf.write(str(content) + '\n\n')
return content
@staticmethod
def append_timestamp_suffix(prefix):
return prefix + '_' + str(int(time.time()))
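# Example usage (hypothetical screen name; assumes valid OAuth credentials are
# configured in utils/twitter_client.py):
#   followers_json = TwitterService.read_followers_by_screen_name('some_user')
#   timeline_json = TwitterService.read_user_timeline_by_screen_name('some_user')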
class TwitterRlTimeService(object):
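# "Real-time" exporters: unlike TwitterService these methods do not cache to disk;
# they page through the live followers/friends API with cursors and write each
# user as a row to an .xlsx file under download/.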
@staticmethod
def read_user_rltime_followers(screen_name):
parent_dir = os.path.join(os.path.dirname(__file__), 'download')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
filename = os.path.join(parent_dir, screen_name + '_followers.xlsx')
workbook = xlsxwriter.Workbook(filename=filename)
worksheet = workbook.add_worksheet(name='Sheet1')
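# Row layout: name, screen_name, description, location, statuses_count,
# friends_count, followers_count, favourites_count, account creation time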
url = 'https://api.twitter.com/1.1/followers/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
cursor, count, row_count = -1, 200, 0
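# Cursor-based pagination: cursor=-1 requests the first page; the response's
# next_cursor points at the next page and next_cursor == 0 marks the last one.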
while True:
t_url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
client = client_service.get_client()
try:
resp, content = client.request(t_url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
resp, content = client.request(t_url, method='GET', body='', headers=None)
result = json.loads(content)
if 'next_cursor' not in result:
break
if result['next_cursor'] == cursor:
break
friends = result['users']
for friend in friends:
create_at = str(friend['created_at'])
create_date = datetime.datetime.strptime(create_at, '%a %b %d %H:%M:%S +0000 %Y')
create_date_txt = create_date.strftime('%Y-%m-%d %H:%M:%S')
worksheet.write(row_count, 0, friend['name'])
worksheet.write(row_count, 1, friend['screen_name'])
worksheet.write(row_count, 2, str(friend['description']).strip().replace('\n', ''))
worksheet.write(row_count, 3, str(friend['location']))
worksheet.write(row_count, 4, friend['statuses_count'])
worksheet.write(row_count, 5, friend['friends_count'])
worksheet.write(row_count, 6, friend['followers_count'])
worksheet.write(row_count, 7, friend['favourites_count'])
worksheet.write(row_count, 8, create_date_txt)
row_count += 1
if result['next_cursor'] == 0:
break
cursor = result['next_cursor']
print cursor
workbook.close()
@staticmethod
def read_user_rltime_followings(screen_name):
parent_dir = os.path.join(os.path.dirname(__file__), 'download')
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
filename = os.path.join(parent_dir, screen_name + '_followings.xlsx')
workbook = xlsxwriter.Workbook(filename=filename)
worksheet = workbook.add_worksheet(name='Sheet1')
url = 'https://api.twitter.com/1.1/friends/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
cursor, count, row_count = -1, 200, 0
while True:
t_url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
client = client_service.get_client()
try:
resp, content = client.request(t_url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
raise
result = json.loads(content)
if 'errors' in result:
client_service.remove_client(client)
client = client_service.get_client()
try:
resp, content = client.request(t_url, method='GET', body='', headers=None)
except Exception as exp:
print exp.message
result = json.loads(content)
if 'next_cursor' not in result:
break
if result['next_cursor'] == cursor:
break
friends = result['users']
for friend in friends:
create_at = str(friend['created_at'])
create_date = datetime.datetime.strptime(create_at, '%a %b %d %H:%M:%S +0000 %Y')
create_date_txt = create_date.strftime('%Y-%m-%d %H:%M:%S')
worksheet.write(row_count, 0, friend['name'])
worksheet.write(row_count, 1, friend['screen_name'])
worksheet.write(row_count, 2, str(friend['description']).strip().replace('\n', ''))
worksheet.write(row_count, 3, str(friend['location']))
worksheet.write(row_count, 4, friend['statuses_count'])
worksheet.write(row_count, 5, friend['friends_count'])
worksheet.write(row_count, 6, friend['followers_count'])
worksheet.write(row_count, 7, friend['favourites_count'])
worksheet.write(row_count, 8, create_date_txt)
row_count += 1
if result['next_cursor'] == 0:
break
cursor = result['next_cursor']
print cursor
workbook.close()
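# Example usage (hypothetical screen name; writes download/<name>_followers.xlsx
# and download/<name>_followings.xlsx):
#   TwitterRlTimeService.read_user_rltime_followers('some_user')
#   TwitterRlTimeService.read_user_rltime_followings('some_user')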
# -*- coding:utf-8 -*-
import sys
import oauth2
import random
import threading
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
KEY_1 = ''
SECRET_1 = ''
CONSUMER_KEY_1 = ''
CONSUMER_SECRET_1 = ''
KEY_2 = ''
SECRET_2 = ''
CONSUMER_KEY_2 = ''
CONSUMER_SECRET_2 = ''
class ClientService(object):
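# Keeps a small pool of OAuth clients so requests can rotate between accounts.
# When a response contains 'errors' (typically rate limiting), the caller removes
# that client from the pool; remove_client() schedules it to be re-added after
# 900 seconds, i.e. Twitter's 15-minute rate-limit window.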
client_1 = None
client_2 = None
client_pool = []
def __init__(self):
if self.client_1 is None:
consumer_1 = oauth2.Consumer(key=CONSUMER_KEY_1, secret=CONSUMER_SECRET_1)
token_1 = oauth2.Token(key=KEY_1, secret=SECRET_1)
self.client_1 = oauth2.Client(consumer=consumer_1, token=token_1)
if self.client_2 is None:
consumer_2 = oauth2.Consumer(key=CONSUMER_KEY_2, secret=CONSUMER_SECRET_2)
token_2 = oauth2.Token(key=KEY_2, secret=SECRET_2)
self.client_2 = oauth2.Client(consumer=consumer_2, token=token_2)
self.client_pool = [self.client_1, self.client_2]
def add_client(self, current_client):
self.client_pool.append(current_client)
def remove_client(self, current_client):
self.client_pool.remove(current_client)
timer = threading.Timer(900, self.add_client, (current_client,))
timer.start()
def get_client(self):
if self.client_pool is None or len(self.client_pool) == 0:
raise Exception('temporarily unavailable clients! rate limit exceeded')
return random.choice(self.client_pool)
DouBan
# -*- coding:utf-8 -*-
import os
import re
import sys
import json
import jieba
import requests
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
# Request headers (a browser-like User-Agent helps avoid being blocked by Douban)
header = {
'Host': 'movie.douban.com',
'Referer': 'https://movie.douban.com/subject/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
# Optional HTTP proxies (defined here but never used below; see the sketch after the list)
proxies = [
{'http': '140.143.96.216:80', 'https': '140.143.96.216:80'},
{'http': '119.27.177.169:80', 'https': '119.27.177.169:80'},
{'http': '221.7.255.168:8080', 'https': '221.7.255.168:8080'}
]
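# NOTE: the proxies above are never passed to requests in the functions below.
# A minimal sketch of rotating them per request (assumes the proxy hosts are
# still reachable):
#   import random
#   resp = requests.get(url, headers=header, proxies=random.choice(proxies), timeout=10)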
def movie_recommend_demo_spider():
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0'
resp = requests.get(url)
if resp.ok:
result = json.loads(resp.content)
for subject in result['subjects']:
print subject['id']
print subject['title']
print subject['url']
print subject['rate']
print subject['is_new']
def movie_subject_demo_spider():
url = 'https://movie.douban.com/subject/26374197/'
resp = requests.get(url)
if resp.ok:
html = BeautifulSoup(resp.content, 'html.parser')
print html.select('div.rating_sum span')[0].text
for t in html.select('div.indent span'):
if t.has_attr('property'):
prop = t.get('property')
if prop == 'v:summary':
print str(t.text).strip()
for s_tag in html.select('script'):
if s_tag.has_attr('type'):
type_txt = s_tag.get('type')
if type_txt == 'application/ld+json':
info = json.loads(s_tag.text)
print info['name']
print info['director']
print info['author']
def movie_subject_comment_demo_spider():
s_url = 'https://movie.douban.com/subject/26374197/comments?start=$start&limit=20&sort=new_score&status=P' # sorted by new_score (hot comments); defined but not used below
t_url = 'https://movie.douban.com/subject/26374197/comments?start=$start&limit=20&sort=time&status=P' # sorted by time (newest first)
resp = requests.get(t_url)
if resp.ok:
html = BeautifulSoup(resp.content, 'html.parser')
comment_div_tags = html.select('div.comment')
for comment_div_tag in comment_div_tags:
comment_id = comment_div_tag.select('h3 span.comment-vote input')[0].get('value')
comment_votes = comment_div_tag.select('h3 span.comment-vote span')[0].text
comment_user_tag = comment_div_tag.select('h3 span.comment-info a')[0]
comment_user_name = comment_user_tag.text
comment_user_profile = comment_user_tag.get('href')
# Rating text to score: 力荐 (highly recommend) = 5, 推荐 (recommend) = 4, 还行 (so-so) = 3, 较差 (poor) = 2, 很差 (very poor) = 1
comment_user_rating_txt = comment_div_tag.select('h3 span.comment-info span')[1].get('title')
comment_user_rating = 5 if comment_user_rating_txt == '力荐' else 4 if comment_user_rating_txt == '推荐' else 3\
if comment_user_rating_txt == '还行' else 2 if comment_user_rating_txt == '较差' else 1
comment_time = comment_div_tag.select('h3 span.comment-info span.comment-time')[0].get('title')
comment_text = comment_div_tag.select('p span.short')[0].text
print comment_id
print comment_votes
print comment_user_name
print comment_user_profile
print comment_user_rating
print comment_time
print comment_text
def movie_comment_spider():
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0'
resp = requests.get(url)
if resp.ok:
result = json.loads(resp.content)
for subject in result['subjects']:
subject_id = subject['id']
subject_title = subject['title']
print '%s %s' % (subject_id, subject_title)
subject_url = subject['url']
subject_resp = requests.get(subject_url)
if subject_resp.ok:
html = BeautifulSoup(subject_resp.content, 'html.parser')
subject_comment_count = html.select('div.rating_sum span')[0].text
print subject_comment_count
subject_short_comment_count_txt = html.select('#comments-section div.mod-hd h2 a')[0].text
subject_short_comment_count = re.findall('\d+', subject_short_comment_count_txt)[0]
print subject_short_comment_count
movie_subject_comment_spider(subject_id, subject_title, int(subject_short_comment_count))
else:
print subject_resp.content
else:
print resp.content
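# Crawl short comments for one subject, page by page, and save them to a CSV
# (one row per comment: id, name, profile, rating, votes, time, text).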
def movie_subject_comment_spider(subject_id, subject_title, subject_short_comment_count):
t_url = 'https://movie.douban.com/subject/$subject_id/comments?start=$start&limit=20&sort=time&status=P'.replace('$subject_id', subject_id)
column1 = []
column2 = []
column3 = []
column4 = []
column5 = []
column6 = []
column7 = []
#for i in range((subject_short_comment_count / 20) + 1): # full pagination over all short comments
for i in range(10): # only the first 10 pages (20 comments per page) are fetched here
resp = requests.get(t_url.replace('$start', str(20 * i)), headers=header)
if resp.ok:
html = BeautifulSoup(resp.content, 'html.parser')
comment_div_tags = html.select('div.comment')
for comment_div_tag in comment_div_tags:
comment_id = comment_div_tag.select('h3 span.comment-vote input')[0].get('value')
comment_votes = comment_div_tag.select('h3 span.comment-vote span')[0].text
comment_user_tag = comment_div_tag.select('h3 span.comment-info a')[0]
comment_user_name = comment_user_tag.text
comment_user_profile = comment_user_tag.get('href')
# Rating text to score: 力荐 (highly recommend) = 5, 推荐 (recommend) = 4, 还行 (so-so) = 3, 较差 (poor) = 2, 很差 (very poor) = 1
comment_user_rating_txt = comment_div_tag.select('h3 span.comment-info span')[1].get('title')
comment_user_rating = 5 if comment_user_rating_txt == '力荐' else 4 if comment_user_rating_txt == '推荐' else 3 \
if comment_user_rating_txt == '还行' else 2 if comment_user_rating_txt == '较差' else 1
comment_time = comment_div_tag.select('h3 span.comment-info span.comment-time')[0].get('title')
comment_text = comment_div_tag.select('p span.short')[0].text
column1.append(comment_id)
column2.append(comment_user_name)
column3.append(comment_user_profile)
column4.append(comment_user_rating)
column5.append(comment_votes)
column6.append(comment_time)
column7.append(str(comment_text).strip().replace(' ', '').replace('\n', '').replace('\r', ''))
df = pd.DataFrame({'id': column1, 'name': column2, 'profile': column3, 'rating': column4,\
'votes': column5, 'time': column6, 'text': column7})
df.to_csv('F:\\result\\tmp\\douban\\$subject_title.csv'.replace('$subject_title', subject_title), sep=',', na_rep='NA', index=False)
#df.to_csv('/home/ym/Project/datamining/resources/$subject_title.csv'.replace('$subject_title', subject_title), sep=',', na_rep='NA', index=False)
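# Build a word cloud from one saved comment CSV: segment the comment text with
# jieba, then render the word frequencies with wordcloud/matplotlib.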
def movie_comment_analyze():
# The CSV written by movie_subject_comment_spider already has a header row;
# passing a set to names= is unordered and would misalign the columns.
df = pd.read_csv('F:\\result\\tmp\\douban\\fengkuangwaixingren.csv')
words = []
for content in df['text']:
words.extend(jieba.cut(content))
word_txt = ' '.join(words)
wc = WordCloud(background_color='white',
max_words=1000, # maximum number of words to show
max_font_size=100, # maximum font size
#mask=back_color, # draw the cloud inside this mask image; when set, width and height are ignored
width=1000,
height=800,
random_state=42, # random seed, keeps the layout reproducible
font_path='F:/develop/python/stfangso/STFANGSO.TTF' # a CJK font is required to render Chinese words
)
print word_txt
wc.generate(word_txt)
# Read the mask image
#back_color = imread('o_002.jpg')
# Generate word colors from the mask image
#image_colors = ImageColorGenerator(back_color)
# Display the word cloud
plt.imshow(wc)
# Hide the axes
plt.axis('off')
plt.show()
#wc.to_file('F:/a.png')
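# Example pipeline (the hard-coded paths above are local to the original
# environment and must be adjusted before running):
#   movie_comment_spider()    # crawl short comments for the currently hot movies
#   movie_comment_analyze()   # build a word cloud from one saved CSV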