Python course code: final project - a web crawler plus a word cloud

Python course: tying together everything we have learned

  • 1. Design the database
  • 2. Crawl the movie data
  • 3. Save the crawled data to the database
  • 4. Build a word cloud

1. Database Design

-- columns match the fields crawled in Part 3 and inserted in Part 4
create table movie(
movieId varchar(20) primary key,
title varchar(100) not null,
url varchar(255),
cover varchar(255),
rate varchar(10),
director varchar(100),
screenWriter varchar(100),
actor varchar(255),
category varchar(100),
district varchar(100),
language varchar(100),
showtime varchar(100),
length varchar(20),
othername varchar(100),
description text,
imdb varchar(20)
);
create table comment(
comment_id char(20) primary key,
comment_writer char(20),
comment_score char(5),
comment_date date,
comment_content text
);
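
The same DDL can also be run from Python; a minimal sketch using pymysql, assuming the two statements above are saved in a local schema.sql (the file name is an assumption) and a MySQL server with the root/empty-password login used later in this post:

import pymysql

# run the CREATE TABLE statements above against the douban database
db = pymysql.connect(host='localhost', user='root', password='', database='douban')
cursor = db.cursor()
with open('schema.sql', 'r', encoding='utf-8') as f:
    # the file holds the two statements above, separated by ';'
    for statement in f.read().split(';'):
        if statement.strip():
            cursor.execute(statement)
db.commit()
cursor.close()
db.close()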

2. Fetching the Movie List

import json
import time
import urllib.request
from urllib.parse import quote

# time format used for log lines
ISOTIMEFORMAT = '%Y-%m-%d %X'
# write the crawled listing to douban_movie.txt
outputFile = 'douban_movie.txt'
fw = open(outputFile, 'w', encoding='utf-8')
fw.write('id;title;url;cover;rate\n')
# spoof browser headers so the request looks like a normal page visit
# (Accept-Encoding is deliberately omitted: urllib does not decompress gzip automatically)
headers = {}
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Language"] = "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4,ja;q=0.2"
headers["Connection"] = "keep-alive"
headers["Host"] = "movie.douban.com"
headers["Referer"] = "http://movie.douban.com/"
headers["Upgrade-Insecure-Requests"] = "1"
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
# fetch the list of movie tags
request = urllib.request.Request(url="http://movie.douban.com/j/search_tags?type=movie", headers=headers)
response = urllib.request.urlopen(request)
tags = json.loads(response.read())['tags']
# start crawling
print("********** START **********")
# time.localtime() converts the seconds elapsed since the 1970-01-01 epoch into
# local time; time.strftime() formats it according to ISOTIMEFORMAT
print(time.strftime(ISOTIMEFORMAT, time.localtime()))
for tag in tags:
    print("Crawl movies with tag: " + tag)
    print(time.strftime(ISOTIMEFORMAT, time.localtime()))
    start = 0
    while True:
        url = "http://movie.douban.com/j/search_subjects?type=movie&tag=" + quote(tag) + "&page_limit=20&page_start=" + str(start)
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)  # urlopen() fetches the target URL
        movies = json.loads(response.read())['subjects']
        if len(movies) == 0:
            break
        for item in movies:
            rate = item['rate']
            title = item['title']    # title
            movieUrl = item['url']   # detail-page link
            cover = item['cover']    # cover image
            movieId = item['id']     # movie id
            # use ';' as the delimiter so each record matches the header line above
            record = str(movieId) + ';' + title + ';' + movieUrl + ';' + cover + ';' + str(rate) + '\n'
            fw.write(record)
            print(tag + '\t' + title)
        start = start + 20
fw.close()
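
Each record is one ';'-separated line, so the listing can be read back with the standard csv module; a minimal sketch:

import csv

with open('douban_movie.txt', 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=';')
    next(reader)  # skip the header: id;title;url;cover;rate
    for movieId, title, url, cover, rate in reader:  # assumes titles contain no ';'
        print(movieId, title, rate)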

3. Fetching Movie Details

import time
import urllib.request
from urllib.error import URLError
from bs4 import BeautifulSoup
import requests

inputFile = 'douban_movie.txt'
fr = open(inputFile, 'r', encoding='utf-8')
outputFile = 'douban_movie_details.txt'
fw = open(outputFile, 'w', encoding='utf-8')
fw.write(
    'id^title^url^cover^rate^director^screenwriter^actor^category^district^language^showtime^length^othername^description\n')
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
absolute = 'https://movie.douban.com/subject/26322642/comments'
firstLine = True
count = 1
errorCount = 0
result = {}
for line in fr:
    if firstLine:
        firstLine = False
        continue
    fields = line.split(';')
    movieId = fields[0]
    title = fields[1]
    url = fields[2]
    cover = fields[3]
    rate = fields[4].rstrip('\n')
    if movieId in result:  # skip movies we have already processed
        continue
    result[movieId] = 1
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        html = BeautifulSoup(response.read(), 'lxml')
        info = html.select('#info')[0].get_text().split('\n')
        # extract each field: split on the colon, keep only the text after it,
        # and strip the surrounding whitespace
        director = info[1].split(':')[-1].strip()      # director
        screenWriter = info[2].split(':')[-1].strip()  # screenwriter
        actor = info[3].split(':')[-1].strip()         # cast
        category = info[4].split(':')[-1].strip()      # genre
        district = info[6].split(':')[-1].strip()      # region
        language = info[7].split(':')[-1].strip()      # language
        showtime = info[8].split(':')[-1].strip()      # release date
        length = info[9].split(':')[-1].strip()        # runtime
        othername = info[10].split(':')[-1].strip()    # alternate titles
        # movie synopsis
        description = html.find_all("span", attrs={"property": "v:summary"})[0].get_text()
        description = description.strip().replace('\n', '\t')
        # write one '^'-delimited record per movie
        record = '^'.join([str(movieId), title, url, cover, str(rate), director,
                           screenWriter, actor, category, district, language,
                           showtime, length, othername, description]) + '\n'
        fw.write(record)
        print(count, title)
        time.sleep(5)  # throttle requests to avoid getting banned
    except URLError as e:
        print(e)
        print(count, title, "Error")
        errorCount = errorCount + 1
    count = count + 1
    print(count, errorCount)


# fetch the comment paragraphs and the link to the next page
def get_data(html):
    soup = BeautifulSoup(html, "lxml")
    comment_list = soup.select('.comment > p')
    next_candidates = soup.select('.next')
    # return None when there is no "next" link, i.e. we reached the last page
    next_page = next_candidates[0].get('href') if next_candidates else None
    return comment_list, next_page

current_page = absolute
comment_list = []
num = 0
while True:
    html = requests.get(current_page, headers=headers).content
    temp_list, next_page = get_data(html)
    # collect this page's comments before deciding whether to stop,
    # so the last page is not lost
    comment_list = comment_list + temp_list
    if next_page is None:
        break
    current_page = absolute + next_page
    # time.sleep(1 + float(random.randint(1, 100)) / 20)
    num = num + 1
# write the crawled comments to a txt file
with open("c:/comments.txt", 'a', encoding='utf-8') as f:
    for node in comment_list:
        comment = node.get_text().strip().replace("\n", "")
        f.write(comment + "\n")


fr.close()
fw.close()
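
Since '^' can also appear inside free text such as the synopsis, it is worth checking that every line of douban_movie_details.txt still splits into exactly 15 fields before loading it; a minimal sketch:

with open('douban_movie_details.txt', 'r', encoding='utf-8') as f:
    f.readline()  # skip the header line
    for lineno, line in enumerate(f, start=2):
        fields = line.rstrip('\n').split('^')
        if len(fields) != 15:
            # a malformed record would break the database load in the next part
            print('line %d has %d fields' % (lineno, len(fields)))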

4. Saving the Data to the Database

import pymysql

inputFile = 'douban_movie_details.txt'
fr = open(inputFile, 'r', encoding='utf-8')
firstLine = True
# pymysql takes the keyword 'password', not 'pwd'
db = pymysql.connect(host='localhost', user='root', password='', database='douban')
cursor = db.cursor()
count = 0
for line in fr:
    if firstLine:
        firstLine = False
        continue
    fields = line.rstrip('\n').split('^')
    movieId = fields[0]
    title = fields[1]
    url = fields[2]
    cover = fields[3]
    rate = fields[4]
    director = fields[5]
    screenWriter = fields[6]
    actor = fields[7]
    category = fields[8]   # genre
    district = fields[9]   # region
    language = fields[10]
    showtime = fields[11]
    length = fields[12]
    othername = fields[13]
    description = fields[14]
    # parameterized query: pymysql escapes every value for us
    cursor.execute(
        'insert into movie(movieId,title,url,cover,rate,director,screenWriter,actor,'
        'category,district,language,showtime,length,othername,description) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
        [movieId, title, url, cover, rate, director, screenWriter, actor, category,
         district, language, showtime, length, othername, description])
    count = count + 1
    print(count, title)
db.commit()  # commit once after the whole load
fr.close()
cursor.close()  # close the cursor before the connection
db.close()
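
A quick query against the same connection settings confirms the load worked; a minimal sketch:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', database='douban')
cursor = db.cursor()
cursor.execute('select count(*) from movie')
print('rows in movie:', cursor.fetchone()[0])  # expect one row per crawled movie
cursor.execute('select movieId, title, rate from movie limit 5')
for row in cursor.fetchall():  # fetchall() returns every row of the result set
    print(row)
cursor.close()
db.close()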

5. Testing the Database

import pymysql
from IP_POOLS import read
import movie

host = 'localhost'
user = 'root'
password = ''
database = 'douban'
db = pymysql.connect(host=host, user=user, password=password, database=database)  # connect to the database
cursor = db.cursor()  # cursor for executing SQL

def create_table():  # create the movie-contents table
    cursor.execute("drop table if exists contents")
    sql = """
         create table contents(
            movie_id char(60) not null,
            movie_title char(200),
            movie_director char(200),
            movie_screenwriter char(200),
            movie_character varchar(1000),
            movie_type char(200),
            movie_country char(200)
                )
            """
    cursor.execute(sql)

def create_table_comment():  # create the comment table
    cursor.execute("drop table if exists comment")
    sql = """
         create table comment(
            movie_id char(60) not null,
            movie_comment varchar(20000)
                )
            """
    cursor.execute(sql)

# insert one movie; the parameterized query lets pymysql escape the values
def insert_database(id, title, director, screenwriter, mcharacter, mtype, country):
    sql = 'insert into contents values(%s,%s,%s,%s,%s,%s,%s)'
    try:
        cursor.execute(sql, (id, title, director, screenwriter, mcharacter, mtype, country))
        db.commit()
    except Exception:
        db.rollback()

# insert one comment
def insert_comment(id, comment):
    sql = 'insert into comment values(%s,%s)'
    try:
        cursor.execute(sql, (id, comment))
        db.commit()
    except Exception:
        db.rollback()

# query the database; fetchone() returns a single row, fetchall() returns all matching rows
def select_database(id):
    sql = 'select movie_id from contents where movie_id=%s'
    try:
        cursor.execute(sql, (id,))
        results = cursor.fetchall()
        for row in results:
            m_id = row[0]
            print("movie_id=%s ok!" % m_id)
    except Exception:
        print("Error: unable to fetch " + id)

if __name__ == '__main__':
    create_table()
    create_table_comment()
    path = 'movie_url.txt'
    n = 0
    for url in read(path):
        id = movie.get_movie_ID(url)
        title = movie.get_movie_title(url)
        director = movie.get_movie_directors(url)
        screenwriter = movie.get_movie_screenwriter(url)
        mcharacter = movie.get_movie_character(url)
        mtype = movie.get_movie_type(url)
        country = movie.get_movie_country(url)
        comment = movie.get_movie_commentary(url)
        print(id)
        print(title)
        insert_database(id, title, director, screenwriter, mcharacter, mtype, country)
        insert_comment(id, comment)
        select_database(id)
        n = n + 1
        if n == 10:  # stop after ten movies for this test
            break
    cursor.close()
    db.close()
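
The movie module imported above is the author's own helper collection and is not listed here. As an illustration only, a hypothetical get_movie_ID could pull the numeric subject id out of a detail URL such as https://movie.douban.com/subject/26636712/ with a regular expression:

import re

# hypothetical stand-in for movie.get_movie_ID, not the actual module code
def get_movie_ID(url):
    match = re.search(r'/subject/(\d+)', url)
    return match.group(1) if match else None

print(get_movie_ID('https://movie.douban.com/subject/26636712/'))  # 26636712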

6. Final: Fetch Movie Info per Tag via Rotating Proxy IPs, Store It in the Database, and Build the Word Cloud

import codecs
import json
import random
import time
import urllib.request
from urllib.error import URLError
from urllib.parse import quote

import jieba
import matplotlib.pyplot as plt
import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree
from wordcloud import WordCloud

headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# fetch a pool of proxy IPs so our real IP does not get banned
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
# pick one proxy at random from the IP pool
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
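# Usage sketch (the proxy-list URL is an assumption; any page that lists proxies
# in a table with the IP in the 2nd <td> and the port in the 3rd will work):
# ip_list = get_ip_list('https://www.xicidaili.com/nn/', headers)
# proxies = get_random_ip(ip_list)
# requests.get('http://movie.douban.com/', headers=headers, proxies=proxies)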
def get_movie_tags(proxy_ip):
    # fetch the tag list through the chosen proxy (switched from urllib to
    # requests so the proxies dict from get_random_ip can be passed directly)
    response = requests.get("http://movie.douban.com/j/search_tags?type=movie",
                            headers=headers, proxies=proxy_ip)
    tags = json.loads(response.text)['tags']
    for tag in tags:
        print("Crawl movies with tag: " + tag)
    return tags

def get_movie_urls_by_tag(proxy_ip, html=''):
    # parse a tag listing page: collect the movie links and the next-page link
    soup = BeautifulSoup(html, "lxml")
    tag_urls_list = soup.select('.tag > p')
    next_candidates = soup.select('.next')
    next_page = next_candidates[0].get('href') if next_candidates else None
    return tag_urls_list, next_page
def get_movie_content(proxy_ip, url):
    outputFile = 'douban_movie.txt'
    fw = open(outputFile, 'w', encoding='utf-8')
    fw.write('id;title;url;cover;rate\n')
    start = 0
    while True:
        # page through the listing API via the page_start parameter
        page_url = url + "&page_limit=20&page_start=" + str(start)
        response = requests.get(page_url, headers=headers, proxies=proxy_ip)
        movies = json.loads(response.text)['subjects']
        if len(movies) == 0:
            break
        for item in movies:
            rate = item['rate']
            title = item['title']    # title
            movieUrl = item['url']   # detail-page link
            cover = item['cover']    # cover image
            movieId = item['id']     # movie id
            # write every record (not just the last one) with the ';' delimiter
            record = str(movieId) + ';' + title + ';' + movieUrl + ';' + cover + ';' + str(rate) + '\n'
            fw.write(record)
        start = start + 20
    fw.close()
def get_movie_detail(inputFile='douban_movie.txt'):
    fr = open(inputFile, 'r', encoding='utf-8')
    outputFile = 'douban_movie_details.txt'
    fw = open(outputFile, 'w', encoding='utf-8')
    fw.write('id^title^url^cover^rate^director^screenwriter^actor^category^district^language^showtime^length^othername^description\n')
    firstLine = True
    count = 1
    errorCount = 0
    result = {}
    for line in fr:
        if firstLine:
            firstLine = False
            continue
        fields = line.split(';')
        movieId = fields[0]
        title = fields[1]
        url = fields[2]
        cover = fields[3]
        rate = fields[4].rstrip('\n')
        if movieId in result:  # skip duplicates
            continue
        result[movieId] = 1
        try:
            request = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(request)
            html = BeautifulSoup(response.read(), 'lxml')
            info = html.select('#info')[0].get_text().split('\n')
            # keep only the text after each colon, stripped of whitespace
            director = info[1].split(':')[-1].strip()      # director
            screenWriter = info[2].split(':')[-1].strip()  # screenwriter
            actor = info[3].split(':')[-1].strip()         # cast
            category = info[4].split(':')[-1].strip()      # genre
            district = info[6].split(':')[-1].strip()      # region
            language = info[7].split(':')[-1].strip()      # language
            showtime = info[8].split(':')[-1].strip()      # release date
            length = info[9].split(':')[-1].strip()        # runtime
            othername = info[10].split(':')[-1].strip()    # alternate titles
            # movie synopsis
            description = html.find_all("span", attrs={"property": "v:summary"})[0].get_text()
            description = description.strip().replace('\n', '\t')
            # write one '^'-delimited record per movie
            record = '^'.join([str(movieId), title, url, cover, str(rate), director,
                               screenWriter, actor, category, district, language,
                               showtime, length, othername, description]) + '\n'
            fw.write(record)
            print(count, title)
            time.sleep(5)  # throttle requests
        except URLError as e:
            print(e)
            print(count, title, "Error")
            errorCount = errorCount + 1
        count = count + 1
        print(count, errorCount)
    fr.close()
    fw.close()
def connect_mysql(host, user, pwd, database):
    conn = pymysql.connect(host=host, user=user, password=pwd, database=database)
    conn.autocommit(1)  # commit automatically after every statement
    return conn
def is_exists_movie_id(movie_id):
    db = pymysql.connect(host=host, user=user, password=pwd, database=database)
    cursor = db.cursor()
    sql = "select * from movie where movieId=%s"
    cursor.execute(sql, (movie_id,))
    one = cursor.fetchone()
    db.close()
    return one is not None
def insert_movie_detail(movie):
    # movie is a 15-field record in the same order as douban_movie_details.txt
    db = pymysql.connect(host=host, user=user, password=pwd, database=database)
    cursor = db.cursor()
    if is_exists_movie_id(movie[0]):
        db.close()
        return
    sql = ("insert into movie(movieId,title,url,cover,rate,director,screenWriter,actor,"
           "category,district,language,showtime,length,othername,description) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor.execute(sql, movie)
    print(sql)
    db.commit()
    db.close()
def get_movie_short_comments(absolute):
    # absolute is the comments URL, e.g. 'https://movie.douban.com/subject/26322642/comments'
    current_page = absolute
    comment_list = []
    while True:
        html = requests.get(current_page, headers=headers).content
        soup = BeautifulSoup(html, "lxml")
        comment_list = comment_list + soup.select('.comment > p')
        next_candidates = soup.select('.next')
        next_page = next_candidates[0].get('href') if next_candidates else None
        if next_page is None:  # no "next" link means we reached the last page
            break
        current_page = absolute + next_page
        # time.sleep(1 + float(random.randint(1, 100)) / 20)
    # write the crawled comments to a txt file
    with open("c:/comments.txt", 'a', encoding='utf-8') as f:
        for node in comment_list:
            comment = node.get_text().strip().replace("\n", "")
            f.write(comment + "\n")
def insert_movie_short_comments(movie, url, headers):
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    # XPath describes a path into the HTML document, much like a directory tree
    hrefs = selector.xpath('//*[@id="content"]/div/div[1]/div[2]/a/@href')
    hrefs.insert(0, '')  # the first page has no href of its own, so add an empty one
    for href in hrefs:
        string = url + href
        print(string + "\n")  # print every page URL to check that none is missing
    names = selector.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    others = selector.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()')
    nums = selector.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]/text()')
    evaluates = selector.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[4]/text()')
    briefs = []
    for j in range(1, 26):  # the list page holds 25 movies
        temp = selector.xpath('//*[@id="content"]/div/div[1]/ol/li[' + str(j) + ']/div/div[2]/div[2]/p[2]/span/text()')
        # some movies (such as No. 247) have no brief, so store 'null' instead
        briefs.append(temp[0] if temp else 'null')
    for name, other, num, evaluate, brief in zip(names, others, nums, evaluates, briefs):
        # insert each row; the parameterized query survives quotes and other
        # special characters inside the text fields
        db = pymysql.connect(host=host, user=user, password=pwd, database=database)
        cursor = db.cursor()
        sql = "insert into douban(name, other, num, evaluate, brief) values(%s,%s,%s,%s,%s)"
        cursor.execute(sql, (name, other, num, evaluate, brief))
        db.commit()
        db.close()
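# Note: the douban table targeted by insert_movie_short_comments is not created
# anywhere above; a guessed schema matching its five insert columns (the column
# types are assumptions):
# create table douban(
#     name varchar(200),
#     other varchar(500),
#     num varchar(50),
#     evaluate varchar(50),
#     brief varchar(1000)
# );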

host = 'localhost'
user = 'root'
pwd = ''
database = 'douban'
def draw_wordcloud():
    file = codecs.open('c:/comments.txt', 'r', encoding='utf-8')
    text = file.read()
    file.close()
    wordlist = jieba.cut(text)  # segment the Chinese text into words
    word_str = ''
    for word in wordlist:
        if len(word) > 1:  # drop single-character tokens
            word_str = word_str + " " + word  # join the words with spaces
    wc = WordCloud(background_color='black',
                   max_words=2000,
                   font_path='c:/simhei.ttf',  # a Chinese font is needed to render Chinese words
                   stopwords=["的", "这种", "这样", "还是", "就是", "这个"],  # stop words
                   max_font_size=150,
                   random_state=50,
                   width=1600,
                   height=900,
                   margin=2)
    my_word_cloud = wc.generate(word_str)
    plt.imshow(my_word_cloud)
    plt.axis('off')
    plt.show()
# Intended pipeline: fetch a proxy, crawl the listing and the details through it,
# load the results into MySQL, then build the word cloud from the comments.
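
Putting the pieces together, a minimal driver sketch under stated assumptions: the proxy-list URL is made up, '热门' is just one of the tags printed by get_movie_tags, and loading douban_movie_details.txt into MySQL is done as in Part 4:

if __name__ == '__main__':
    ip_list = get_ip_list('https://www.xicidaili.com/nn/', headers)  # assumed proxy source
    proxies = get_random_ip(ip_list)
    get_movie_tags(proxies)  # print the available tags
    base = 'http://movie.douban.com/j/search_subjects?type=movie&tag=' + quote('热门')
    get_movie_content(proxies, base)      # pages through one tag, writes douban_movie.txt
    get_movie_detail('douban_movie.txt')  # writes douban_movie_details.txt
    # load the details file into MySQL as in Part 4, then crawl one movie's
    # short comments and render the word cloud from them
    get_movie_short_comments('https://movie.douban.com/subject/26322642/comments')
    draw_wordcloud()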