#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scraper for www.ugirls.com magazine listing pages.

Workflow:
    1. Fetch a page's HTML.
    2. Extract HTML tag names / magazine info with regular expressions.
    3. Store the extracted records in MongoDB.
    4. Download the cover images to disk.
"""
import os
import contextlib  # kept: may be used by parts of the project not shown here
import requests
import time
import re
import sys  # kept: may be used by parts of the project not shown here
from pymongo import MongoClient


# Example of the data being matched (kept for reference):
#   https://img.ugirls.tv/uploads/magazine/cover/8c85..._cover_web_l.jpg
#   https://www.ugirls.com/Shop/Detail/Magazine-477.html
#   alt="[U373]杜花花"

# Compiled once at module level: captures the detail URL, the cover image URL
# and the alt-text title of one magazine entry.  Domain dots are escaped so a
# literal "." is required (the original pattern matched any character there).
_INFO_PATTERN = re.compile(
    r'<a href="(?P<url>https://www\.ugirls\.com/Shop/Detail/.*)".*'
    r'target="_blank"><img src='
    r'"(?P<pic>https://img\.ugirls\.tv/uploads/magazine/cover/.*\.jpg)".*'
    r'alt="(?P<title>.*)" /></a>'
)


def get_html(url):
    """Return the HTML text of *url*.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, instead of silently
            handing an error page to the parsers downstream.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def html_analysis(res, text):
    """Apply each regex in *res* to *text* and collect the de-duplicated hits.

    Args:
        res: iterable of regex pattern strings.
        text: HTML source to search.

    Returns:
        A list with one entry per pattern: the list of unique matches
        (order not guaranteed, because duplicates are removed via a set).
    """
    return [list(set(re.findall(pattern, text))) for pattern in res]


def label_handle(html_label_list):
    """Placeholder for post-processing of the extracted labels (not implemented)."""
    pass


def save_to_mongo(label_data):
    """Store base/back tag lists in MongoDB, then print the stored documents.

    ``label_data[0]`` goes to collection ``base_label`` and ``label_data[1]``
    to ``back_label``; each document gets a 1-based ``id`` and a timestamp.
    """
    conn = MongoClient("localhost", 27017)
    try:
        db = conn.label_file
        # insert_one() replaces Collection.insert(), removed in pymongo 4.
        for n, label in enumerate(label_data[0], start=1):
            db.base_label.insert_one(
                {"id": n, "base_label": "{}".format(label), "datetime": time.ctime()}
            )
        for a, label in enumerate(label_data[1], start=1):
            db.back_label.insert_one(
                {"id": a, "back_label": "{}".format(label), "datetime": time.ctime()}
            )
        show_database(db)
    finally:
        # Always release the connection, even if an insert fails.
        conn.close()


def show_database(db):
    """Print every document of the ``base_label`` collection in *db*."""
    # count_documents() replaces Cursor.count(), removed in pymongo 4.
    print("collection counts=", db.base_label.count_documents({}))
    for doc in db.base_label.find({}, {"_id": 0}):
        print(doc)
        time.sleep(2)  # slow down the output so it can be read


def data_handle(url):
    """Fetch *url*, extract the HTML tag names, and persist them to MongoDB."""
    text = get_html(url)
    # Regex dictionary keyed by purpose; "label" matches opening/closing tag names.
    res_dict = {"label": [r"<\w*\b", r"</\w*\b"], "pic": ""}
    html_label_list = html_analysis(res_dict["label"], text)
    # label_handle() is still a stub, so the raw lists are stored directly.
    save_to_mongo(html_label_list)


def save_to_mongodb(total_info):
    """Insert each person-info dict of *total_info* into ``person_file.person_info``."""
    conn = MongoClient("localhost", 27017)
    try:
        collection = conn.person_file.person_info
        for person_info in total_info:
            collection.insert_one(person_info)  # insert() is deprecated/removed
        print("The data was stored.")
    finally:
        conn.close()  # the original leaked this connection


def save_to_file(total_info):
    """Download every cover image in *total_info* into the local Pictures folder."""
    for info in total_info:
        res = requests.get(info["pic"])
        path = r"C:\Users\Mi\Pictures\Saved Pictures\%s.jpg" % (info["title"],)
        with open(path, "wb") as ff:
            ff.write(res.content)
        time.sleep(2)  # throttle requests to be polite to the server


def get_pic(url):
    """Full pipeline for one listing page: fetch, parse, store, download."""
    text = get_html(url)
    total_info = get_info(text)
    save_to_mongodb(total_info)
    save_to_file(total_info)


def get_info(text):
    """Parse a listing page into ``[{"title", "url", "pic"}, ...]``.

    The page is split on "div" so ``search`` (which yields only the first
    match) can be applied chunk by chunk.
    """
    total_info = []
    for chunk in re.split("div", text):
        match = _INFO_PATTERN.search(chunk)
        if not match:
            continue  # this chunk contains no magazine entry
        total_info.append(
            {
                # alt text looks like "[U373]name"; drop the 6-char "[Uxxx]" prefix
                "title": match.group("title")[6:],
                "url": match.group("url"),
                "pic": match.group("pic"),
            }
        )
    return total_info


def create_path(pic_name):
    """Ensure today's download directory exists and return a target file path.

    Args:
        pic_name: suffix used to build a unique file name.  (The original
            referenced an undefined global ``n`` here, which raised
            NameError; the parameter is now actually used.)
    """
    day = time.strftime("%Y-%m-%d", time.gmtime())
    folder = r"C:\pic_spy" + "\\" + day
    # exist_ok replaces the fragile manual os.path.exists(s[:23]) check,
    # whose hard-coded slice did not even cover the full path.
    os.makedirs(folder, exist_ok=True)
    return r"%s\meinv%s.jpg" % (folder, pic_name)


def save_img(path, rs):
    """Stream the ``requests`` response *rs* to *path* in 1 KiB chunks."""
    with open(path, "wb") as ff:
        for data in rs.iter_content(1024):
            ff.write(data)


if __name__ == "__main__":
    # Crawl listing pages Page-1.html .. Page-8.html (same range as the
    # original manual counter, without the dead ``url`` reassignments).
    for page in range(1, 9):
        get_pic("https://www.ugirls.com/Content/Page-%d.html" % page)