第一个项目总结
本项目是编写一个基于django的系统,实现分布式爬虫的对接以及爬取数据的可视化。
Django配置
数据库使用磁盘数据库sqlite
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': 'D:/sqlitedb/finally_data.db',
}
}
静态文件的使用image、html、css、js
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
STATICFILES_DIRS = [
('css', os.path.join(STATIC_ROOT, 'css')),
('img', os.path.join(STATIC_ROOT, 'img')),
('dynamic photo', os.path.join(STATIC_ROOT, 'dynamic photo')),
('js', os.path.join(STATIC_ROOT, 'js')),
]
在与manage.py同级的目录下创建文件,如下
功能实现
用户的注册、登录
建立名为user的app并编写视图函数,使用内置User模型。
前端代码中对应的input要加上name属性,views.py中对应使用request.POST.get('username', '')和request.POST.get('password', '')获取
# urls.py
urlpatterns = [
path('login.html', loginView, name="login"),
path('register.html', registerView, name="register"),
]
# views.py
import re
from django.contrib import messages
from django.contrib.auth import login, authenticate
from django.contrib.auth.models import User
from django.shortcuts import render, redirect
def sub_config_file(username, password, path=''):
    """Rewrite SpiderKeeper's config.py, replacing the default
    'admin'/'admin' BASIC_AUTH credentials with the given pair.

    :param username: new value for BASIC_AUTH_USERNAME
    :param password: new value for BASIC_AUTH_PASSWORD
    :param path: location of the config.py to rewrite (default '' kept
                 for backward compatibility; callers should pass the
                 real SpiderKeeper config path)
    :return: None
    """
    with open(path, 'r+', encoding='utf8') as f:
        text = f.read()
        # Rewind and clear so the rewritten text fully replaces the old file.
        f.seek(0)
        f.truncate()
        # BUG FIX: the original passed re.S as the 4th positional argument
        # of re.sub, which is `count` (re.S == 16), not `flags` -- it
        # silently capped the number of replacements at 16 instead of
        # enabling DOTALL.  The patterns contain no '.', so no flag is
        # needed at all.
        text = re.sub("BASIC_AUTH_USERNAME = 'admin'",
                      "BASIC_AUTH_USERNAME = '{}'".format(username), text)
        text = re.sub("BASIC_AUTH_PASSWORD = 'admin'",
                      "BASIC_AUTH_PASSWORD = '{}'".format(password), text)
        f.write(text)
def loginView(request):
    """Handle the login form POST: authenticate against Django's built-in
    User model and log the user in on success.

    On failure re-renders 'index.html' with an error via the messages
    framework; anything else falls through to a redirect to '/'.
    """
    if request.method == 'POST':
        username = request.POST.get('username', '')
        password = request.POST.get('password', '')
        print(username, password)  # debug output
        # Only try to authenticate when the username exists at all.
        if User.objects.filter(username=username):
            user = authenticate(username=username, password=password)
            print('111')  # debug marker
            if user:
                import pickle
                # NOTE(review): writes a marker file that some other part of
                # the system apparently watches ("login jumps to :5000" per
                # the surrounding notes) -- purpose not visible here, confirm
                # before removing.
                with open('D:/the_flag.pk', 'wb') as fp:
                    pickle.dump('ex', fp)
                print('222')  # debug marker
                if user.is_active:
                    login(request, user)
                    return redirect('trackBack')
                # NOTE(review): an inactive (but authenticated) user falls
                # through to the final redirect('/') with no error message.
            else:
                print('333')  # debug marker
                tips = '账号密码错误,请重新输入'
                messages.error(request, tips)
                # locals() hands `tips` (and the rest) to the template.
                return render(request, 'index.html', locals())
        else:
            print('444')  # debug marker
            tips = '用户不存在,请注册'
            messages.error(request, tips)
            return render(request, 'index.html', locals())
    return redirect('/')
# 登录直接跳5000了
def registerView(request):
    """Handle the registration form POST: create a new Django User unless
    the username is already taken.

    Renders 'index.html' in every case; a duplicate username posts an
    info message through the messages framework.
    """
    if request.method == 'POST':
        username = request.POST.get('username', '')
        password = request.POST.get('password', '')
        # .exists() avoids fetching rows just to test membership.
        if User.objects.filter(username=username).exists():
            tips = '用户已经存在'
            messages.info(request, tips)
        else:
            # create_user() hashes the password and saves the row itself;
            # the original's extra user.save() was redundant.
            User.objects.create_user(username=username, password=password)
            return render(request, 'index.html')
    return render(request, 'index.html', locals())
html中加这个才会弹窗,执行message的提示。
{# 执行弹窗操作 #}
{% if messages %}
<script>
{% for msg in messages %}
alert('{{ msg.message }}');
{% endfor %}
</script>
{% endif %}
<script>
实现右上角显示用户的部分
{% if request.user.is_authenticated %}
<li><a href="#">你好!{{ request.user.username}},欢迎使用本系统</a></li>
<li><a href="#">退出</a></li>
{% else %}
<li><a href="#" data-toggle="modal" data-target=".bs-example-modal-sm1">注册</a></li>
<li><a href="#" data-toggle="modal" data-target=".bs-example-modal-sm">登录</a></li>
{% endif %}
当用户登录以后,会给管理员发送邮件
使用smtplib库
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
def sent_email():
    """Notify the administrator by email (via QQ's SMTP server) that a
    user has logged in and crawling may begin.

    :return: True when the mail was handed to the server, False on any
             SMTP/network failure (result also echoed to stdout, matching
             the original behavior).
    """
    my_sender = '@qq.com'  # sender's QQ mailbox
    my_pass = ''           # SMTP authorization code from QQ mail settings
    my_user = '@qq.com'    # recipient's QQ mailbox
    try:
        msg = MIMEText('请开始爬取', 'plain', 'utf-8')
        # formataddr takes (display name, address).
        msg['From'] = formataddr(["FromRunoob", my_sender])
        msg['To'] = formataddr(["FK", my_user])
        msg['Subject'] = "客户端"  # mail subject line
        # QQ's SMTP-over-SSL endpoint; port 465 (the original comment
        # incorrectly said 25, which is the plaintext SMTP port).
        server = smtplib.SMTP_SSL("smtp.qq.com", 465)
        server.login(my_sender, my_pass)
        server.sendmail(my_sender, [my_user], msg.as_string())
        server.quit()
    except Exception:
        # Any failure (auth, DNS, connection) is reported, not raised.
        print("邮件发送失败")
        return False
    print("邮件发送成功")
    return True
数据库迁移,把数据从redis迁移至sqlite
要注意的是此处写入数据库时候的优化,不是写一条SQL语句就提交一次。先把多条语句写入内存,再提交,大概比一条一条快几十倍。最大内存写入瓶颈是200条左右。
import time
import json
import redis
import sqlite3
def exchange_data():
    """Drain scraped items from the Redis list ``jh:items`` and persist
    them into the sqlite table ``detail_book``.

    Uses blpop (blocking FIFO pop), so the loop waits for new items.
    Rows are inserted with a parameterized statement and committed in
    batches of 200 to amortize transaction overhead.
    """
    # Redis connection holding the scrapy-redis item queue.
    rediscli = redis.StrictRedis(host='', port=6379, db=0)
    # sqlite target -- the same file Django's DATABASES setting points at.
    conn = sqlite3.connect('D:/sqlitedb/finally_data.db')
    # Parameterized INSERT: avoids the SQL-injection / quoting bugs the
    # original %-formatted string was exposed to.
    sql = ("insert into detail_book"
           "(title,comments,author,large_category,small_category,"
           "publisher,img,price,rate) values (?,?,?,?,?,?,?,?,?)")
    try:
        for i in range(10000):
            # blpop blocks until an item arrives (FIFO; brpop would be LIFO).
            source, data = rediscli.blpop(["jh:items"])
            item = json.loads(data.decode('utf-8'))
            print(i)
            print(item)
            # Popularity is stored as a string that may be empty.
            goods_popular = item.get("goods_popular")
            goods_popular = int(goods_popular) if goods_popular else 0
            try:
                conn.execute(sql, (
                    str(item.get("goods_description")),
                    int(item.get("goods_review")),
                    str(item.get("goods_author")),
                    str(item.get("classify_kind_name")),
                    str(item.get("kind_detail")),
                    str(item.get("goods_publish")),
                    str(item.get("goods_img_url")),
                    float(item.get("goods_now_price")),
                    goods_popular,
                ))
            except Exception:
                # Malformed items are skipped, matching the original
                # best-effort behavior.
                pass
            # BUG FIX: the original committed INSTEAD of executing when
            # i % 200 == 0, silently dropping every 200th item.  Now every
            # row is executed and we commit once per 200 rows.
            if i % 200 == 0:
                conn.commit()
    finally:
        # Flush the tail of the last batch and release the connection.
        conn.commit()
        conn.close()
实现一个大分类下出现小分类(淘宝、京东那种)
先对所有的大分类进行过滤去重,在遍历这个去重了的列表,对对应的每一个大分类进行数据库的索引,搜索出下面的小分类并去重生成列表,生成一个字典,以大分类名为键,对应的小分类的值为列表。传入到模板中。
另一点要注意的是不能直接对QuerySet对象的序列进行多次遍历——它是惰性序列,所以要先list()一下再遍历。
def compare(request):
    """Build the large-category -> small-categories mapping for analysis.html."""
    # Distinct large categories, capped at 35; list() forces the lazy
    # QuerySet so it can be iterated safely.
    category_list = Book.objects.values('large_category').distinct()[:35]
    small_category_dict = {}
    for entry in list(category_list)[:35]:
        big_name = entry['large_category']
        # Distinct small categories that live under this large category.
        small_category_dict[big_name] = (
            Book.objects.filter(large_category=big_name)
            .values_list('small_category')
            .distinct()
        )
    context = {
        'category_list': category_list,
        'small_category_dict': small_category_dict,
    }
    return render(request, 'analysis.html', context=context)
商品列表的展示
# urls.py
urlpatterns = [
# path('', listing),
path('search', search, name='search'),
# todo 传递一个flag表明当前所在的是大分类还是小分类
path('category=<category_name>', get_category, name='show1'),
path('ccategory=<category_name>', get_small_category, name='show2'),
path('callback=<category_name>', callback, name='callback'),
# todo 没办法,只能把小分类和大分类的排序函数分开了,并设定url一者为categery=,另一者为ccategory=
path('sort_by_price=<category_name>', sort_by_price, name='sort_by_price'),
path('sort_by_comments=<category_name>', sort_by_comments, name='sort_by_comments'),
path('go!!!<page>', get_input_page, name='input_page'),
]
# views.py
from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger
from django.db.models import Q
from django.shortcuts import render, redirect
from detail.photo_crawler import PhotoCrawler
from .models import *
import pickle
# 执行某一页对应的图片的爬取
def download(books):
    """Download cover images for one page of books and pair each book
    with its local image path.

    Returns (list of (book, local_img_path, img_filename) tuples, books).
    """
    local_paths = []
    remote_urls = []
    filenames = []
    for item in books:
        if item.img == 'None':
            # No cover stored for this book (the crawler saved the
            # literal string 'None').
            local_paths.append('')
            filenames.append('NA')
        else:
            remote_urls.append(item.img)
            # The last 16 characters of the URL double as the file name.
            local_paths.append('img/' + item.img[-16:])
            filenames.append(item.img[-16:])
    # Fetch all covers for this page concurrently.
    crawler = PhotoCrawler()
    crawler.more_processing(remote_urls)
    return list(zip(books, local_paths, filenames)), books
# 商品展示
def listing(request):
    """Show the whole catalogue, 12 books per page, downloading the
    cover images for the requested page."""
    paginator = Paginator(Book.objects.all(), 12)
    requested = request.GET.get('page')
    try:
        books = paginator.page(requested)
    except PageNotAnInteger:
        # Missing/non-numeric page parameter -> first page.
        books = paginator.page(1)
    except EmptyPage:
        # Out-of-range page -> last page.
        books = paginator.page(paginator.num_pages)
    # Fetch cover images for just this page of results.
    new_list, books = download(books)
    return render(request, 'details.html', {'books': books, 'new_list': new_list})
# 大分类页和详情页的搜索功能
def search(request):
    """Search title/author/publisher for a keyword and show paginated results.

    Also records the full search URL (path + keyword) in a pickle file so
    the sort views can send the user back to this exact result page.
    """
    # BUG FIX: default to '' -- request.GET.get('keyword') returns None
    # when the parameter is absent, which crashed the string concatenation
    # below with a TypeError.
    keyword = request.GET.get('keyword', '')
    # Persist the referring URL for the "back to original ordering" feature.
    with open('D:/referer_url.pk', 'wb') as fp:
        pickle.dump(request.path + '?keyword=' + keyword, fp)
    if request.method == 'GET':
        # Match the keyword against title, author or publisher.
        book_list = Book.objects.filter(
            Q(title__icontains=keyword) | Q(author__icontains=keyword) | Q(publisher__icontains=keyword))
        paginator = Paginator(book_list, 12)
        page = request.GET.get('page')
        try:
            books = paginator.page(page)
        except PageNotAnInteger:
            books = paginator.page(1)
        except EmptyPage:
            books = paginator.page(paginator.num_pages)
        # Download cover images for this page of results.
        new_list, books = download(books)
        # The 'flagshere' suffix tells the sort views this category_name is
        # really a search keyword, so they re-run the search before sorting.
        return render(request, 'details.html',
                      {'books': books, 'category_name': keyword + 'flagshere', 'new_list': new_list})
# 从外面的大分类点击 “查看更多”的时候进入的分页
def get_category(request, category_name):
    """Paginated listing of one large category (entered via '查看更多')."""
    # Remember this URL so the sort views can jump back to it later.
    with open('D:/referer_url.pk', 'wb') as fp:
        pickle.dump(request.path, fp)
    matches = Book.objects.filter(large_category=category_name)
    paginator = Paginator(matches, 12)
    try:
        books = paginator.page(request.GET.get('page'))
    except PageNotAnInteger:
        # Missing/non-numeric page parameter -> first page.
        books = paginator.page(1)
    except EmptyPage:
        # Out-of-range page -> last page.
        books = paginator.page(paginator.num_pages)
    # Download cover images for this page of results.
    new_list, books = download(books)
    return render(request, 'details.html',
                  {'new_list': new_list, 'books': books, 'category_name': category_name})
# 通过小分类进行商品的筛选
def get_small_category(request, category_name):
    """Paginated listing filtered by a small (sub-)category."""
    matches = Book.objects.filter(small_category=category_name)
    paginator = Paginator(matches, 12)
    # Remember this URL so the sort views can jump back to it later.
    with open('D:/referer_url.pk', 'wb') as fp:
        pickle.dump(request.path, fp)
    try:
        books = paginator.page(request.GET.get('page'))
    except PageNotAnInteger:
        # Missing/non-numeric page parameter -> first page.
        books = paginator.page(1)
    except EmptyPage:
        # Out-of-range page -> last page.
        books = paginator.page(paginator.num_pages)
    # Download cover images for this page of results.
    new_list, books = download(books)
    return render(request, 'details.html',
                  {'new_list': new_list, 'books': books, 'category_name': category_name})
'''问题:实现 搜索后的排序,分类后的排序'''
# 再次点击综合排序的时候,返回最初的排序
def callback(request, category_name):
    """'综合排序' button: jump back to the URL recorded before sorting."""
    # NOTE: pickle.load on a local file written only by our own views;
    # never point this at untrusted data.
    with open('D:/referer_url.pk', 'rb') as fp:
        saved_url = pickle.load(fp)
    return redirect(saved_url)
# 通过价格排序
def sort_by_price(request, category_name):
    """Sort the current listing (search result, large or small category)
    by price, descending."""

    def paginate(queryset):
        # Shared 12-per-page pagination with graceful fallbacks.
        paginator = Paginator(queryset, 12)
        try:
            return paginator.page(request.GET.get('page'))
        except PageNotAnInteger:
            return paginator.page(1)
        except EmptyPage:
            return paginator.page(paginator.num_pages)

    witch = 'pri'  # tells the template which sort button is active
    if category_name.endswith('flagshere'):
        # 'flagshere' marks a search keyword rather than a category name:
        # strip it, re-run the search, then re-attach it for the template.
        keyword = category_name.replace('flagshere', '')
        queryset = Book.objects.filter(
            Q(title__icontains=keyword) | Q(author__icontains=keyword) |
            Q(publisher__icontains=keyword)).order_by('-price')
        books = paginate(queryset)
        new_list, books = download(books)
        return render(request, 'details.html',
                      {'new_list': new_list, 'books': books,
                       'category_name': keyword + 'flagshere', 'witch': witch})
    # Otherwise try the large category first, falling back to small.
    queryset = Book.objects.filter(large_category=category_name).order_by('-price')
    if not queryset:
        queryset = Book.objects.filter(small_category=category_name).order_by('-price')
    books = paginate(queryset)
    new_list, books = download(books)
    return render(request, 'details.html',
                  {'new_list': new_list, 'books': books,
                   'category_name': category_name, 'witch': witch})
# 按照评论数排序
def sort_by_comments(request, category_name):
    """Sort the current listing (search result, large or small category)
    by comment count, descending."""

    def paginate(queryset):
        # Shared 12-per-page pagination with graceful fallbacks.
        paginator = Paginator(queryset, 12)
        try:
            return paginator.page(request.GET.get('page'))
        except PageNotAnInteger:
            return paginator.page(1)
        except EmptyPage:
            return paginator.page(paginator.num_pages)

    witch = 'coms'  # tells the template which sort button is active
    if category_name.endswith('flagshere'):
        # 'flagshere' marks a search keyword rather than a category name:
        # strip it, re-run the search, then re-attach it for the template.
        keyword = category_name.replace('flagshere', '')
        queryset = Book.objects.filter(
            Q(title__icontains=keyword) | Q(author__icontains=keyword) |
            Q(publisher__icontains=keyword)).order_by('-comments')
        books = paginate(queryset)
        new_list, books = download(books)
        return render(request, 'details.html',
                      {'new_list': new_list, 'books': books,
                       'category_name': keyword + 'flagshere', 'witch': witch})
    # Otherwise try the large category first, falling back to small.
    queryset = Book.objects.filter(large_category=category_name).order_by('-comments')
    if not queryset:
        queryset = Book.objects.filter(small_category=category_name).order_by('-comments')
    books = paginate(queryset)
    new_list, books = download(books)
    return render(request, 'details.html',
                  {'new_list': new_list, 'books': books,
                   'category_name': category_name, 'witch': witch})
def get_input_page(request, page):
    """Jump to the page number the user typed into the page-jump box.

    :param page: URL segment indicating whether the referring URL already
                 carries a 'page' parameter ('1' means it does not).
    The target page number arrives in the ?want_to_got_page query param.
    """
    # Default to '1' so a missing parameter cannot crash the
    # concatenation below (the original would raise TypeError on None).
    page_number = request.GET.get('want_to_got_page', '1')
    referer = request.META.get('HTTP_REFERER')
    # Everything before the first 'page' is the base URL to rebuild on.
    part_of_url = referer.split('page')[0]
    if page == '1':
        # Referer had no page parameter yet: start the query string.
        url = part_of_url + '?page=' + page_number
    else:
        # Referer already ends just before 'page=': append directly.
        url = part_of_url + 'page=' + page_number
    return redirect(url)
注意,如果要跳转到不在自己服务器上的任意网页
可以使用href的静态链接,也可以用HttpResponseRedirect
from django.shortcuts import HttpResponseRedirect
视图函数下
return HttpResponseRedirect('想要跳转的url')
对图书进行快速爬取,此处有每一本书的url,使用多线程爬取。大概是普通爬的5倍。
import os
from multiprocessing.dummy import Pool
from urllib.request import urlretrieve
class PhotoCrawler:
    """Downloads book cover images concurrently with a thread pool."""

    # Where downloaded covers are stored, relative to the working directory.
    # TODO: running the project from a high-privilege location (the original
    # note suggests D:\) avoids permission failures on write.
    SAVE_DIR = r'.\static\img'

    def get_photo(self, url):
        """Download one image; the last 16 chars of the URL name the file.

        Failures (network errors, permission problems) are deliberately
        swallowed so one broken image does not abort the whole page.
        """
        path = os.path.join(self.SAVE_DIR, url[-16:])
        try:
            urlretrieve(url, path)
        except Exception:
            # `except Exception` (not bare except) so Ctrl-C still works.
            pass

    def more_processing(self, url_list):
        """Fetch every URL in url_list using 8 worker threads.

        multiprocessing.dummy.Pool is a *thread* pool -- appropriate here
        because image downloading is I/O-bound.
        """
        pool = Pool(8)
        try:
            # map() blocks until all downloads have been attempted.
            pool.map(self.get_photo, url_list)
        finally:
            pool.close()
            pool.join()
自定义过滤器的使用
在与manage.py同级的目录下创建文件,并将该文件添加至settings
在里面建立myfilter.py编写过滤器
from django import template
register = template.Library()
# 自定义过滤器实现模板变量的过滤
@register.filter
def myRepalce(value, args):
    """Template filter: replace occurrences using an 'old:new' spec.

    Used to turn the ',' in a search field into '/'.  (The misspelled
    name is kept: templates already load the filter as 'myRepalce'.)
    """
    old, new = args.split(':')
    return value.replace(old, new)
# # 返回对应的字符串的出版社
@register.filter
def get_real_publisher(value):
    """Template filter: pull the publisher name out of a stringified list.

    The crawler stores the field as the repr of a Python list; parse it
    and return its only element, or the last one when several exist.
    Anything unparseable is returned unchanged.
    """
    import ast
    try:
        # literal_eval only parses literals -- unlike the original eval(),
        # it cannot execute code smuggled into scraped, untrusted data.
        parsed = ast.literal_eval(value)
        if len(parsed) == 1:
            return parsed[0]
        return parsed[-1]
    except Exception:
        return value
# 返回对应字符串的作者
@register.filter
def get_real_author(value):
    """Template filter: extract author names from a stringified list.

    Returns the lone author directly; for multiple entries drops the last
    two items (presumably trailing non-author tokens from the crawl --
    confirm against the data) and joins the rest with commas.  Input
    that does not parse as a list is returned as-is.
    """
    import ast
    try:
        # Safe replacement for the original eval() on scraped strings.
        parsed = ast.literal_eval(value)
        if len(parsed) == 1:
            return parsed[0]
        authors = parsed[:-2]
    except Exception:
        return value
    return ','.join(authors)
# 在分页的按钮实现数字的减1
@register.filter
def get_sub(value):
    """Template filter: subtract one (used for pagination button numbers)."""
    return value - 1
@register.filter
def clean_rate(value):
    """Template filter: scale the stored rate by 10.

    Assumes value parses as an integer string -- TODO confirm with data.
    """
    rate = int(value)
    return rate * 10
html中的使用
{% load myfilter %}
<span>{{ book.publisher|get_real_publisher }}</span>
搜索
<div class="search-block">
<form action="{% url 'search' %}" method="get">
<input type="text" name="keyword" />
<button type="submit">搜索</button>
</form>
</div>
对应的views.py里面request.GET.get(‘keyword’)
搜索后的分页
1.照常使用分页模型
2.使用js处理跳转的url
# 分页模型
# 商品展示
def listing(request):
    """Paginated view of the whole catalogue (12 books per page)."""
    all_books = Book.objects.all()
    pages = Paginator(all_books, 12)
    page_param = request.GET.get('page')
    try:
        books = pages.page(page_param)
    except PageNotAnInteger:
        # Missing/non-numeric page parameter -> first page.
        books = pages.page(1)
    except EmptyPage:
        # Past the end -> clamp to the last page.
        books = pages.page(pages.num_pages)
    # Fetch cover images for just this page of results.
    new_list, books = download(books)
    return render(request, 'details.html', {'books': books, 'new_list': new_list})
html中
<ul class="pagination pull-right">
{% if books.has_previous %}
<li>
<a onclick="button_page(this,{{ books.number }})">
<span>«</span>
</a>
</li>
{% endif %}
<li><a onclick="button_page(this,{{ books.number }})" title="{{ books.number }}">{{ books.number }}</a></li>
<li><a onclick="button_page(this,{{ books.number|add:"1" }})" class="" title="{{ books.number|add:"1" }}">{{ books.number|add:"1" }}</a></li>
<li><a onclick="button_page(this,{{ books.number|add:"2" }})" class="" title="{{ books.number|add:"2" }}">{{ books.number|add:"2" }}</a></li>
<li><a onclick="button_page(this,{{ books.number|add:"3" }})" class="" title="{{ books.number|add:"3" }}">{{ books.number|add:"3" }}</a></li>
<li><a onclick="button_page(this,{{ books.number|add:"4" }})" class="" title="{{ books.number|add:"4" }}">{{ books.number|add:"4" }}</a></li>
<li><a href="">...</a></li>
<li><a onclick="button_page(this,{{ books.paginator.num_pages }})" title="{{ books.paginator.num_pages }}">{{ books.paginator.num_pages }}</a></li>
{# {{ books.paginator.num_pages }}#}
{% if books.has_next %}
<li>
<a onclick="nextPage(this)" title="点击此处可以翻到下一页哦!">
<span>»</span>
</a>
</li>
{% endif %}
</ul>
</nav>
处理url的js
//翻页js
// Build the base URL (every parameter except 'page') that the pagination
// links append "page=N" to.
function getUrl() {
    var current_url = window.location.href;
    var params = current_url.split('?');
    var url = '';
    // URLs without a search keyword:
    if (current_url.indexOf("keyword") == -1) {
        if (params.length == 1) {
            // No query string at all: just start one.
            url += '?';
        } else {
            // Keep every existing parameter except 'page'.
            // (var i: the original leaked the index as an implicit global.)
            for (var i = 0; i < params.length; i++) {
                if (params[i].indexOf('page') == -1) {
                    if (i == 0) {
                        // BUG FIX: the original appended the base URL twice
                        // here, producing "hosthost?..." links.
                        url += params[i] + '?';
                    } else {
                        url += params[i] + '&';
                    }
                }
            }
        }
    } else {
        // Search URLs: keep everything up to the first '&' (the keyword).
        var mm = current_url.split('&');
        url = mm[0] + "&";
    }
    return url;
}
//下一页
// Navigate to the next page; the target page number is rendered
// server-side by the Django template tags below.
function nextPage(node) {
    // Base URL with every parameter except 'page' preserved.
    var url = getUrl()
    {% if books.has_next %}
    var href = url + "page={{ books.next_page_number}}"
    {% else %}
    // Already on the last page: link back to the current page.
    var href = url + "page={{ books.number }}"
    {% endif %}
    node.href = href
}
//上一页
// Navigate to the previous page; the target page number is rendered
// server-side by the Django template tags below.
function previousPage(node) {
    // Base URL with every parameter except 'page' preserved.
    var url = getUrl()
    {% if books.has_previous %}
    var href = url + "page={{ books.previous_page_number}}"
    {% else %}
    // Already on the first page: link back to the current page.
    var href = url + "page={{ books.number }}"
    {% endif %}
    node.href = href
}
//点击某一页的按钮跳转到指定的那一页
//这个函数是后端人员写的~~滑稽
// Jump straight to the page number baked into this pagination button.
function button_page(node, mynumber) {
    node.href = getUrl() + "page=" + mynumber;
}
</script>