Web Scraping, Part 2
1. Storing Scraped Data in a Database
    1. Installing the modules with Anaconda
        1. Open an Anaconda Prompt terminal (as administrator)
        2. Run the install commands
            conda install pymongo
            conda install pymysql
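        A quick import check confirms that both drivers installed correctly (a minimal sanity check, run from any terminal):
            python -c "import pymongo, pymysql; print('modules OK')"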
    2. Storing data in a remote MySQL database
        1. Enable remote connections
            In /etc/mysql/mysql.conf.d/mysqld.cnf, comment out the line:
            # bind-address = 127.0.0.1
            then restart the MySQL service: sudo systemctl restart mysql
        2. Add a user authorized for remote access (MySQL 5.7 syntax; see the MySQL 8 note after this list)
            mysql> grant all privileges on *.* to "username"@"%" identified by "123456" with grant option;
        3. Add a rule allowing external access to port 3306
            sudo ufw allow 3306
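        Note: the grant ... identified by ... form above was removed in MySQL 8.0. On 8.0, create the user first and then grant privileges (equivalent commands, using the same placeholder username and password):
            mysql> create user "username"@"%" identified by "123456";
            mysql> grant all privileges on *.* to "username"@"%" with grant option;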
    3. Basic ufw (Ubuntu firewall) operations
        1. Enable   : sudo ufw enable
        2. Disable  : sudo ufw disable
        3. Add rule : sudo ufw allow <port>
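        4. Check status : sudo ufw status   (lists the active rules, e.g. to confirm the port-3306 rule above took effect)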
# Store the scraped data in MongoDB
import urllib.request
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.offset = 0
        # connection object
        self.conn = pymongo.MongoClient("192.168.56.129", 27017)
        # database object
        self.db = self.conn["MaoDB"]
        # collection object
        self.myset = self.db["film"]

    # fetch one page
    def getPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parsePage(html)

    # parse one page
    def parsePage(self, html):
        # compiled pattern: captures title, star list and release time
        regex = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        rList = regex.findall(html)
        # rList: [("霸王别姬", "张国荣", "1993"), ...]
        self.writeTomongo(rList)

    # save one page of records
    def writeTomongo(self, rList):
        for r in rList:
            d = {
                "name": r[0].strip(),
                "star": r[1].strip(),
                "releasetime": r[2].strip()
            }
            self.myset.insert_one(d)
        print("Saved to the MaoDB database")

    # main loop
    def workOn(self):
        while True:
            c = input("Press y to scrape, q to quit: ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.offset)
                self.getPage(url)
                self.offset += 10
            else:
                print("Scraping finished")
                break
        # Alternative: scrape all 10 pages in one go (requires import time)
        # for i in range(0, 91, 10):
        #     url = self.baseurl + str(i)
        #     self.getPage(url)
        #     time.sleep(0.1)

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
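After a run, the stored documents can be spot-checked from Python; a minimal sketch, assuming the same host, database, and collection names as the spider above:

# verify the MongoDB inserts: total count plus one sample document
import pymongo

conn = pymongo.MongoClient("192.168.56.129", 27017)
film = conn["MaoDB"]["film"]
print(film.count_documents({}))  # number of films stored so far
print(film.find_one())           # one sample document
conn.close()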
# Store the scraped data in MySQL
import urllib.request
import re
import pymysql
import warnings

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.offset = 0
        # database connection object (pymysql 1.0+ requires keyword arguments)
        self.db = pymysql.connect(
            host="192.168.56.129",
            user="lion",
            password="123456",
            database="spiderdb",
            charset="utf8")
        # cursor object
        self.cursor = self.db.cursor()

    # fetch one page
    def getPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parsePage(html)

    # parse one page
    def parsePage(self, html):
        # compiled pattern: captures title, star list and release time
        regex = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        rList = regex.findall(html)
        # rList: [("霸王别姬", "张国荣", "1993"), ...]
        self.writeToMysql(rList)

    # save one page of records
    def writeToMysql(self, rList):
        # suppress warnings raised by the statements below
        warnings.filterwarnings("ignore")
        ins = 'insert into film(name,star,releasetime) values(%s,%s,%s)'
        for r in rList:
            # r[2] looks like "上映时间:1993-01-01"; [5:15] keeps the 10-character date
            L = [r[0].strip(), r[1].strip(), r[2].strip()[5:15]]
            # pass the parameters as a sequence so pymysql escapes them safely
            self.cursor.execute(ins, L)
        # commit the whole page in one transaction
        self.db.commit()

    # main loop
    def workOn(self):
        while True:
            c = input("Press y to scrape, q to quit: ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.offset)
                self.getPage(url)
                self.offset += 10
            else:
                print("Scraping finished")
                # close the cursor and connection only after all pages are done
                self.cursor.close()
                self.db.close()
                break
        # Alternative: scrape all 10 pages in one go (requires import time)
        # for i in range(0, 91, 10):
        #     url = self.baseurl + str(i)
        #     self.getPage(url)
        #     time.sleep(0.1)

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
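The MySQL version assumes the film table already exists in spiderdb; the script never creates it. A minimal one-time setup sketch, using the same connection settings as above (the column widths here are arbitrary choices, not from the notes):

# one-time setup: create the film table the spider inserts into
import pymysql

db = pymysql.connect(host="192.168.56.129", user="lion",
                     password="123456", database="spiderdb",
                     charset="utf8")
cursor = db.cursor()
cursor.execute("""
    create table if not exists film(
        name varchar(100),
        star varchar(300),
        releasetime varchar(30)
    ) default charset=utf8
""")
db.commit()
cursor.close()
db.close()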