原始数据从当当网爬取到MongoDB里,导出为json文件,由于开始没爬图片,此处补上,没有使用数据库~
准备部分:
import pickle
import requests
import os
import time
class PhotoDownloader(object):
    """Download the first image of every product into one folder per category.

    The category dictionary (``{category: [goods_dict, ...]}``) is read from a
    pickle file; ``total`` counts the images that were saved successfully.
    """

    def __init__(self, dict_path='D:/memory_dict.pk',
                 img_root=r'D:\tianmaxingkong\static\img'):
        """Load the category dictionary and reset the counter.

        Args:
            dict_path: Pickle file holding the category dictionary.
            img_root: Directory that receives one sub-folder per category.
        """
        # NOTE: pickle executes code on load; only read files this project wrote.
        with open(dict_path, 'rb') as fp:
            self.memory_dict = pickle.load(fp)
        self.img_root = img_root
        self.total = 0

    def downloader(self, photo_url, category):
        """Fetch one image and store it under ``<img_root>/<category>/``.

        The file name is the last 16 characters of the URL. A failed download
        is reported and skipped so that one bad URL does not stop the run.
        """
        dir_name = os.path.join(self.img_root, category)
        path = os.path.join(dir_name, photo_url[-16:])
        try:
            response = requests.get(photo_url, timeout=10)
            # Do not save an HTML error page as if it were an image.
            response.raise_for_status()
            os.makedirs(dir_name, exist_ok=True)
            with open(path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as exc:
            print('download failed: %s (%s)' % (photo_url, exc))
            return
        self.total += 1

    def myFliter(self):
        """Download the first image of every product in every category."""
        for category, goods in self.memory_dict.items():
            for good in goods:
                urls = good.get('goods_img_url')
                if not urls:
                    # Product without any image URL: nothing to download.
                    continue
                photo_url = urls[0]
                self.downloader(photo_url, category)
                print(photo_url)
if __name__ == '__main__':
    # Run the whole download and report the image count and elapsed seconds.
    started_at = time.time()
    photo_job = PhotoDownloader()
    photo_job.myFliter()
    finished_at = time.time()
    print('总共%s张图片' % photo_job.total)
    print('花费时间:', finished_at - started_at)
进行数据的清洗和数据初始化到本地
以及搜索功能的实现
import re
import json
import demjson# 经过对比demjson的解析速度比json慢很多
import random
import pickle# 对每个大分类的内容以键名的形式保存到本地
import os
import requests
import time
def get_orgin_list(file_path='D:/dangdang666.json'):
    """Split the exported dump into one raw JSON string per record.

    Args:
        file_path: UTF-8 text file containing the exported records.

    Returns:
        A list of strings, each spanning from an opening brace, across one
        nested ``{...}`` object, to the next closing brace.
    """
    # The file is only read, so open it read-only.
    with open(file_path, 'r', encoding='utf8') as fp:
        orgin_string = fp.read()
    # re.S lets a record span several lines.
    pattern = r'{.*?{.*?}.*?}'
    return re.findall(pattern, orgin_string, re.S)
def get_data(raw_records=None, memory_path='D:/memory_dict.pk',
             result_path='D:/result_list.pk'):
    """Parse, clean and group the raw records, then cache both results on disk.

    A record is dropped when it is not valid JSON, has no image URL, or has no
    author (missing, empty, or the placeholder '无'). For records whose category
    is found, the first image URL is replaced by the local static path
    ``img/<category>/<last 16 characters of the URL>``.

    Args:
        raw_records: Raw JSON strings; defaults to ``get_orgin_list()``.
        memory_path: Pickle file that receives the category dictionary.
        result_path: Pickle file that receives the cleaned record list.

    Returns:
        ``(result_list, memory_dict)``: the cleaned records as dicts, and a
        ``{category: [record, ...]}`` dictionary sharing the same objects.
    """
    if raw_records is None:
        raw_records = get_orgin_list()
    # Compiled once; the category is taken from the raw text of the record.
    category_pattern = re.compile(r'classify_kind_name.:\["(.*?)"\]', re.S | re.I)
    memory_dict = dict()
    result_list = []
    for json_data in raw_records:
        json_data = json_data.strip()
        try:
            record = json.loads(json_data)
        except ValueError:
            # Not a complete JSON document: skip it.
            continue
        if not isinstance(record, dict):
            continue
        img_urls = record.get('goods_img_url')
        authors = record.get('goods_author')
        if not isinstance(img_urls, list) or not img_urls \
                or not isinstance(img_urls[0], str):
            continue
        if not isinstance(authors, (list, str)) or not authors \
                or authors[0] == '无':
            continue
        match = category_pattern.search(json_data)
        if match:
            # '/' cannot appear in a folder name, so it is replaced by ','.
            category = match.group(1).replace('/', ',')
            img_urls[0] = 'img' + '/' + category + '/' + img_urls[0][-16:]
            memory_dict.setdefault(category, []).append(record)
        result_list.append(record)
    with open(memory_path, 'wb') as fp:
        pickle.dump(memory_dict, fp)
    with open(result_path, 'wb') as ff:
        pickle.dump(result_list, ff)
    return result_list, memory_dict
def search_keyword(keyword, records=None):
    """Return every record whose author, title or publisher contains *keyword*.

    The match is a case-insensitive literal substring test, so input such as
    ``C++`` is safe. Each record appears at most once in the result.

    Args:
        keyword: Text typed by the user.
        records: Records to search; defaults to the cleaned list from
            ``get_data()``.

    Returns:
        The matching records, in their original order.
    """
    json_object_list = get_data()[0] if records is None else records
    # re.escape neutralises every regex metacharacter in the user input.
    pattern = re.compile(re.escape(keyword), re.I)

    def texts_of(value):
        # A field may be missing, a single string, or a list of strings.
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return [text for text in value if isinstance(text, str)]
        return []

    answer_list = list()
    for json_object in json_object_list:
        if not isinstance(json_object, dict):
            continue
        fields = (
            json_object.get("goods_author"),
            json_object.get("goods_description"),
            json_object.get("goods_publish"),
        )
        if any(pattern.search(text)
               for field in fields for text in texts_of(field)):
            answer_list.append(json_object)
    for answer in answer_list:
        print(answer)
    print(len(answer_list))
    return answer_list
# 提取数据分割成页,以便翻页的时候或者要跳转页面的时候直接引用
def get_detail_at_first(memory_dict=None, root='D:\\all_pickle', page_size=50,
                        pages=5):
    """Pre-split every category into page files that the views load directly.

    For each category ``<root>/<category>/<category><n>.pk`` is written for
    ``n = 1..pages``. Page 1 starts at the first product; pages beyond the end
    of the data hold an empty list.

    Args:
        memory_dict: ``{category: [record, ...]}``; defaults to the dictionary
            returned by ``get_data()``.
        root: Directory that receives one sub-folder per category.
        page_size: Number of products per page.
        pages: Number of page files written per category.
    """
    if memory_dict is None:
        memory_dict = get_data()[1]
    for key, goods in memory_dict.items():
        path = os.path.join(root, key)
        os.makedirs(path, exist_ok=True)
        for page in range(1, pages + 1):
            new_path = os.path.join(path, key + str(page) + '.pk')
            with open(new_path, 'wb') as fp:
                # Page n covers items (n-1)*page_size .. n*page_size - 1.
                pickle.dump(goods[(page - 1) * page_size:page * page_size], fp)
if __name__ == '__main__':
    # Ask for a search term, run the search and print the elapsed seconds.
    wanted = input('请输入你想要查询的商品的名字:')
    began = time.time()
    search_keyword(wanted)
    finished = time.time()
    print(finished - began)
Django部分:
from django.contrib import admin
from django.urls import path,include
from detail_page_show import views as detail_page_show_views
# Project-level URL table: the admin site plus the URLconfs of the two apps.
urlpatterns = [
path('admin/', admin.site.urls),
# Product list, paging and sorting pages (detail_page_show app).
path('detail/',include('detail_page_show.urls'),name='detail'),
# Category overview and keyword search (category_show app).
path('category/',include('category_show.urls'),name='category'),
]
settings.py
此处主要写配置前端文件的地方
# STATIC_ROOT is <BASE_DIR>/static.
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
# (prefix, directory) pairs; templates refer to a file as '<prefix>/<file>',
# e.g. {% static 'css/detail_page.css' %}.
# NOTE(review): every directory below lives inside STATIC_ROOT - confirm this
# layout is intended before relying on collectstatic.
STATICFILES_DIRS = [
('css', os.path.join(STATIC_ROOT, 'css')),
('img', os.path.join(STATIC_ROOT, 'img')),
('dynamic img',os.path.join(STATIC_ROOT,'dynamic img')),
('js', os.path.join(STATIC_ROOT, 'js')),
]
两个app,category_show、detail_page_show分别实现分类页和详情页
category_show
urls.py
from django.urls import path
from .views import *
# URL table of the category_show app.
urlpatterns = [
# Landing page: one row of products per category.
path('',show_first,name='show11'),
# Keyword search; the search forms in the templates submit here via {% url 'search' %}.
path('searched',search_keyword,name='search'),
]
from django.shortcuts import render
import pickle
import random
from random import shuffle
import os
import re
# Create your views here.
def show_first(request):
    """Render the category overview page.

    The template receives ``view_list``: one single-key dict per category whose
    value is a one-element list wrapping the first 7 products of that category
    (category.html reads them as ``value.0.<n>``).
    """
    # NOTE: pickle executes code on load; only read files this project wrote.
    with open('D:/memory_dict.pk', 'rb') as fp:
        memory_dict = pickle.load(fp)
    view_list = [{key: [goods[:7]]} for key, goods in memory_dict.items()]
    context = {
        'view_list': view_list,
    }
    return render(request, 'category.html', context=context)
# 将查找到的那些内容的数据结构,根据数据的数量进行页面数量的划分
# 并下载好每一页为文件,以便search_keyword直接调用
def how_many_page(answer_path='D:/answer_list.pk', out_dir='D:/search_pickle',
                  page_size=50, max_pages=5):
    """Split the stored search result into page files ``1.pk`` .. ``<max_pages>.pk``.

    Every page file is rewritten on each call; pages beyond the end of the
    result hold an empty list, so a page left over from an earlier, larger
    search can never be served for the current one. Results beyond
    ``page_size * max_pages`` are not paged.

    Args:
        answer_path: Pickle file holding the full result list.
        out_dir: Directory that receives the page files.
        page_size: Number of results per page.
        max_pages: Number of page files to write.
    """
    # NOTE: pickle executes code on load; only read files this project wrote.
    with open(answer_path, 'rb') as fp:
        answer_list = pickle.load(fp)
    os.makedirs(out_dir, exist_ok=True)
    for page in range(1, max_pages + 1):
        # Slicing past the end simply yields a shorter (or empty) list.
        chunk = answer_list[(page - 1) * page_size:page * page_size]
        with open(os.path.join(out_dir, str(page) + '.pk'), 'wb') as f:
            pickle.dump(chunk, f)
def search_keyword(request):
    """Search the cached records for the ``keyword`` GET parameter.

    A record matches when its author, title or publisher contains the keyword
    (case-insensitive literal match); each record is listed once. A missing
    keyword is treated as an empty string. The full result is saved to
    ``D:/answer_list.pk``, split into page files by ``how_many_page()``, and
    the first page is rendered with ``detail_page.html``.
    """
    # NOTE: pickle executes code on load; only read files this project wrote.
    with open('D:/result_list.pk', 'rb') as fp:
        json_object_list = pickle.load(fp)
    keyword = request.GET.get('keyword') or ''
    # re.escape neutralises every regex metacharacter in the user input.
    pattern = re.compile(re.escape(keyword), re.I)

    def texts_of(value):
        # A field may be missing, a single string, or a list of strings.
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return [text for text in value if isinstance(text, str)]
        return []

    def matches(record):
        fields = (
            record.get("goods_author"),
            record.get("goods_description"),
            record.get("goods_publish"),
        )
        return any(pattern.search(text)
                   for field in fields for text in texts_of(field))

    answer_list = [record for record in json_object_list
                   if isinstance(record, dict) and matches(record)]
    # NOTE(review): the result files are shared by every visitor, so two
    # simultaneous searches overwrite each other.
    with open('D:/answer_list.pk', 'wb') as fff:
        pickle.dump(answer_list, fff)
    # Write the per-page files for this result.
    how_many_page()
    with open('D:/search_pickle/1.pk', 'rb') as ff:
        show_list = pickle.load(ff)
    context = {
        'data_list': show_list,
        'pp_page': 2,
        'detail_name': 'searchhhh',
    }
    return render(request, 'detail_page.html', context=context)
# 如果只是一页的话就不显示下面的翻页栏了
def next_page(request,page_now):
    """Render page ``page_now`` of the stored search result.

    Loads ``D:/search_pickle/<page_now>.pk`` and passes its content to
    ``detail_page.html`` together with the number of the following page.
    """
    page_file = os.path.join('D:/search_pickle', '{}.pk'.format(page_now))
    with open(page_file, 'rb') as handle:
        page_items = pickle.load(handle)
    return render(
        request,
        'detail_page.html',
        context={
            'data_list': page_items,
            'pp_page': page_now + 1,
            'detail_name': 'searchhhh',
        },
    )
detail_page_show
from django.urls import path
from .views import *
# URL table of the detail_page_show app. <detail_name> is a category name, or
# the literal 'searchhhh' when the pages come from a keyword search.
urlpatterns = [
# First page of a category.
path('<detail_name>/',show_first,name='show1'),
# Page <page_now> of a category or of the stored search result.
path('<detail_name>/<int:page_now>/',next_page111,name='next_page'),
# Re-sort the stored search result by price / by review count.
path('<detail_name>/<int:page_now>/sort_by_price',sort_by_price,name="sort_by_price"),
path('<detail_name>/<int:page_now>/sort_by_comment',sort_by_comment,name="sort_by_comment"),
# Target of the "default order" link.
path('<detail_name>/<int:page_now>/callback',callback,name="callback"),
]
import os
import pickle
import re
from django.shortcuts import render
from category_show.views import how_many_page# 从分类的视图里面调用下载页面函数
# Create your views here.
def show_first(request, detail_name):
    """Render the first page of the category ``detail_name``.

    The page content is read from
    ``D:/all_pickle/<detail_name>/<detail_name>1.pk``.
    """
    first_page = 1
    page_file = os.path.join(
        'D:/all_pickle', detail_name, detail_name + str(first_page) + '.pk')
    with open(page_file, 'rb') as handle:
        goods_on_page = pickle.load(handle)
    return render(
        request,
        'detail_page.html',
        context={
            'data_list': goods_on_page,
            'pp_page': first_page + 1,
            'detail_name': detail_name,
        },
    )
# 翻页处理
def next_page111(request, detail_name, page_now):
    """Render page ``page_now`` of a category or of the stored search result.

    ``detail_name == 'searchhhh'`` selects the search page files; any other
    value is treated as a category name.
    """
    page_file = (
        'D:/search_pickle/{}.pk'.format(page_now)
        if detail_name == 'searchhhh'
        else 'D:/all_pickle/{0}/{0}{1}.pk'.format(detail_name, page_now)
    )
    print(page_file)
    with open(page_file, 'rb') as handle:
        goods_on_page = pickle.load(handle)
    return render(
        request,
        'detail_page.html',
        context={
            'data_list': goods_on_page,
            'pp_page': page_now + 1,
            'detail_name': detail_name,
        },
    )
# 按照价格进行排序
def sort_by_price(request,detail_name,page_now):
    """Sort the stored search result by price (highest first) and show page 1.

    The sorted list is written back to ``D:/answer_list.pk`` and re-split into
    page files. Because page 1 is rendered, the next-page number is 2;
    ``page_now`` is only part of the URL and is not used.

    NOTE(review): the stored search result is sorted even when ``detail_name``
    is a category name - confirm whether category pages should be sortable.
    """
    def price_of(item):
        # The price is stored with a one-character prefix; an unparsable
        # value sorts last instead of failing the whole request.
        try:
            return float(item["goods_now_price"][1:])
        except (KeyError, TypeError, ValueError):
            return 0.0

    with open('D:/answer_list.pk', 'rb') as fp:
        answer_list = pickle.load(fp)
    answer_list.sort(key=price_of, reverse=True)
    with open('D:/answer_list.pk', 'wb') as fp1:
        pickle.dump(answer_list, fp1)
    # Rebuild the page files from the sorted list.
    how_many_page()
    with open('D:/search_pickle/1.pk', 'rb') as ff:
        show_list = pickle.load(ff)
    context = {
        'data_list': show_list,
        'pp_page': 2,
        'detail_name': detail_name,
    }
    return render(request, 'detail_page.html', context=context)
# 按照评论数今行排序
def sort_by_comment(request,detail_name,page_now):
    """Sort the stored search result by review count (highest first) and show page 1.

    The sorted list is written back to ``D:/answer_list.pk`` and re-split into
    page files. Because page 1 is rendered, the next-page number is 2;
    ``page_now`` is only part of the URL and is not used.

    NOTE(review): the stored search result is sorted even when ``detail_name``
    is a category name - confirm whether category pages should be sortable.
    """
    def review_count_of(item):
        # The count is followed by a three-character suffix; an unparsable
        # value sorts last instead of failing the whole request.
        try:
            return int(item["goods_review"][:-3])
        except (KeyError, TypeError, ValueError):
            return 0

    with open('D:/answer_list.pk', 'rb') as fp:
        answer_list = pickle.load(fp)
    answer_list.sort(key=review_count_of, reverse=True)
    with open('D:/answer_list.pk', 'wb') as fp1:
        pickle.dump(answer_list, fp1)
    # Rebuild the page files from the sorted list.
    how_many_page()
    with open('D:/search_pickle/1.pk', 'rb') as ff:
        show_list = pickle.load(ff)
    context = {
        'data_list': show_list,
        'pp_page': 2,
        'detail_name': detail_name,
    }
    return render(request, 'detail_page.html', context=context)
# 点击综合排序返回首次出现的页面
def callback(request,detail_name,page_now):
    """Render page ``page_now`` of a category or of the stored search result.

    Target of the "default order" link. The next-page number handed to the
    template is always 1.

    NOTE(review): the other views pass ``page_now + 1`` as the next page -
    confirm that 1 is intended here.
    """
    page_file = (
        'D:/search_pickle/{}.pk'.format(page_now)
        if detail_name == 'searchhhh'
        else 'D:/all_pickle/{0}/{0}{1}.pk'.format(detail_name, page_now)
    )
    with open(page_file, 'rb') as handle:
        goods_on_page = pickle.load(handle)
    return render(
        request,
        'detail_page.html',
        context={
            'data_list': goods_on_page,
            'pp_page': 1,
            'detail_name': detail_name,
        },
    )
def sort_by_release_time(request):
    """Placeholder for sorting by release date; not implemented, returns None."""
    return None
前端文件
category.html
<!DOCTYPE html>
{% load static %}
<html lang="en">
<head>
<meta charset="UTF-8">
<title>分类</title>
<style>
* {
margin: 0;
padding: 0;
}
.search {
width: 374px;
margin: 50px auto;
}
.search input {
float: left;
}
.search input[type=text] {
width: 300px;
height: 35px;
}
.search input[type=button] {
height: 39px;
width: 70px;
}
.big {
width: 1300px;
margin: 0 auto;
}
.main {
width: 100%;
}
.zuo,
.you {
margin: 0 auto;
margin-top: 50px;
width: 100%;
border: 1px solid #ccc;
/*height: 2px;
background-color: red;*/
}
.zuo {
float: left;
}
.you {
float: right;
}
.sma1,
.sma2 {
width: 100%;
height: 40px;
background-color: #eee;
}
.kuai {
display: inline-block;
margin-left: 15px;
margin-top: 12px;
width: 4px;
height: 16px;
border-radius: 3px;
background-color: #ff2832;
}
.otext .duo {
display: inline-block;
text-align: center;
line-height: 40px;
float: right;
margin-right: 20px;
}
.duo a {
text-decoration: none;
color: #0b6fde;
}
h3 {
line-height: 16px;
margin-top: -21px;
margin-left: 28px;
}
h3 a {
color: #000;
text-decoration: none;
font-family: MicroSoft YaHei;
font-size: 16px;
font-weight: bold;
}
.otext {
width: 100%;
line-height: 35px;
color: #f00;
background-color: #ccc;
font-size: 15px;
}
.otext h4 {
margin-left: 20px;
display: inline-block;
}
.otext .duo :hover,
.otext h4:hover {
text-decoration: underline;
}
.tu {
padding: 20px;
}
.tu img {
width: 158px;
}
.intext .tu {
float: left;
width: 150px;
height: 264px;
}
.intext .tu div {
width: 140px;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
</style>
</head>
<body>
<div class="big">
<div class="search">
<form action="{% url 'search' %}" method="get">
<input type="text" name="keyword" placeholder="请输入搜索内容" />
<button type="submit">搜索</button><!-- 搜索 -->
</form>
</div>
<div class="main">
{% for view in view_list %}
<div class="zuo">
<div class="sma1">
<span class="kuai"></span>
<h3><a href="#">图书</a></h3>
</div>
{% for key,value in view.items %}
<div class="intext">
    <div class="otext">
        <h4>{{ key }}</h4>
        <div class="duo"><a href="{% url 'show1' detail_name=key %}">查看更多</a></div>
    </div>
    {# value.0 is the list of goods supplied by the view; show at most six tiles #}
    {% for good in value.0|slice:":6" %}
    <div class="tu">
        <img src="{% static good.goods_img_url.0 %}" alt="">
        <div>{{ good.goods_description }}</div>
        <p>价格:{{ good.goods_now_price }}</p>
        <p>{{ good.goods_review }}</p>
    </div>
    {% endfor %}
</div>
{% endfor %}
</div>
{% endfor %}
</div>
</div>
</body>
</html>
detail_page.html
<!DOCTYPE html>
{% load static %}
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document</title>
<link rel="stylesheet" href="{% static 'css/detail_page.css' %}">
</head>
<body>
<div class="father">
<form action="{% url 'search' %}" class="page" method="get">
<input type="text" name="keyword" placeholder="请输入搜索内容" />
<button type="submit">搜索</button><!-- 搜索 -->
</form>
<ul class="biao">
    <li>价格:</li>
    <li>好评度:</li>
    <li>出版社:</li>
    <li>出版日期:</li>
</ul>
<div class="zhu">
<ul class="henglan">
<li >
<div class="paixu" id="lll">
<a href="{% url 'callback' detail_name=detail_name page_now=pp_page %}" class="onclickRed" >综合排序</a>
<a href="{% url 'sort_by_price' detail_name=detail_name page_now=pp_page %}" class="" >价格</a>
<a href="{% url 'sort_by_comment' detail_name=detail_name page_now=pp_page %}" class="" >销量</a>
</div>
</li>
<li>
</li>
</ul>
</div>
<div class="ban">
<ul class="gl" id="g1" style="display: block;"><!-- 手机列表 -->
{% for data in data_list %}
<li>
<img src="{% static data.goods_img_url.0 %}" alt="">
<p>{{data.goods_now_price}}</p>
<div>{{ data.goods_description }}</div>
<p>{{ data.goods_review }}</p>
</li>
{% endfor %}
</ul>
</div>
<div class="page1 paixu henglan" id="ll">
<a href="{% url 'next_page' detail_name=detail_name page_now=pp_page %}" class="" >下一页</a>
{# <a href="#" class="" >5</a>#}
<a href="{% url 'next_page' detail_name=detail_name page_now=4 %}" class="" >4</a>
<a href="{% url 'next_page' detail_name=detail_name page_now=3 %}" class="" >3</a>
<a href="{% url 'next_page' detail_name=detail_name page_now=2 %}" class="" >2</a>
<a href="{% url 'next_page' detail_name=detail_name page_now=1 %}" class="onclickRed" >1</a>
</div>
</div>
<script src="{% static 'js/detail_page.js' %}"></script>
</body>
</html>
detail_page.css
/* Reset default margin and padding on every element
   (". *" is not a valid selector, so the rule was being ignored). */
* {
    margin: 0;
    padding: 0;
}
.father {
margin: 20px auto;
width: 1110px;
position: relative;
}
.center {
}
.page1 {
/*background-color: red;*/
/*position: absolute;
bottom: -200px;
right: 0;*/
overflow: hidden;
}
ul {
padding: 0;
}
.biao li,
.henglan li{
display: block;
width: 100%;
height: 34px;
line-height: 34px;
border: 1px dashed #eee;
color: #999;
font-size: 12px;
}
.onclickRed {
background-color: red;
color: #fff!important;
}
.henglan li {
border: 1px solid #ddd;
position: relative;
list-style: none;
}
.henglan li:first-child {
}
.henglan li .paixu {
margin: auto 5px;
/*background-color: blue;*/
position: absolute;
}
.henglan li .paixu a{
/*display: block;
float: left;*/
text-decoration: none;
border: 1px solid #ccc;
padding: 3px;
color: #333;
}
.henglan li .paixu a:hover,
.page1 a:hover {
border: 1px solid red;
color: red;
}
.ban {
width: 100%;
overflow: hidden;
position: relative;
}
.gl {
margin: 10px 0;
position: relative;
}
/* Product images fill the width of their list item
   (class selector needs the leading dot; "gl" alone matches a <gl> element). */
.gl img {
    width: 100%;
}
.gl li {
float: left;
text-decoration: none;
list-style: none;
margin-top: 20px;
width: 220px;
height: 310px;
border: 1px solid #ccc;
}
.gl li div {
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.gl li:hover {
border: 1px solid red;
/*注意这里改了*/
}
.gl p ,
.gl div {
margin:5px 24px;
}
.page1 a {
height: 36px;
line-height: 36px;
padding: 0 14px;
margin-right: 5px;
font-size: 14px;
text-decoration: none;
border: 1px solid #ccc;
float: right;
}
detail_page.js
// Shorthand for document.getElementById.
function my$(id)
{
    return document.getElementById(id);
}

// Set the CSS display value of the element with the given id.
// Ids that do not exist in the page are ignored.
function setDisplay(id, value)
{
    var el = my$(id);
    if (el) {
        el.style.display = value;
    }
}

// Give `active` the highlight class and clear it from every other link.
function highlightOnly(links, active)
{
    for (var j = 0; j < links.length; j++) {
        links[j].className = "";
    }
    active.className = "onclickRed";
}

// Sort bar: highlight the clicked link and, when a matching panel
// "g<n>" exists, show it and hide the other panels.
var sortBar = my$("lll");
var pObj = sortBar ? sortBar.getElementsByTagName("a") : [];
for (var i = 0; i < pObj.length; i++) {
    pObj[i].onclick = (function (index) {
        return function () {
            highlightOnly(pObj, this);
            var target = "g" + (index + 1);
            if (!my$(target)) {
                // No panel for this link: leave the current list visible
                // instead of hiding it and failing on a missing element.
                return;
            }
            setDisplay("g1", "none");
            setDisplay("g2", "none");
            setDisplay("g3", "none");
            setDisplay(target, "block");
        };
    })(i);
}

// Pagination bar: highlight the clicked page link.
var pageBar = my$("ll");
var p1Obj = pageBar ? pageBar.getElementsByTagName("a") : [];
for (var i = 0; i < p1Obj.length; i++) {
    p1Obj[i].onclick = function () {
        highlightOnly(p1Obj, this);
    };
}
前端是我同学写的。。。