【创新实训】数据预处理(三)

(1)定义数据库增删改查方法

# mongodb database
from pymongo import MongoClient


class Database(object):
    """Thin convenience wrapper around a single MongoDB database.

    Every method takes the collection name as its first argument and forwards
    the call to the matching PyMongo collection method.
    """

    def __init__(self, database, address='127.0.0.1', port=27017, name=None, pwd=None):
        """Open a client connection and select *database*.

        Args:
            database: Name of the database to use.
            address: Server host name or IP address.
            port: Server port.
            name: Optional user name; when falsy no credentials are sent.
            pwd: Password that belongs to *name*.
        """
        if name:
            # Database.authenticate() no longer exists in PyMongo 4, so the
            # credentials are handed to the client instead.  authSource keeps
            # authentication bound to the selected database.
            self.conn = MongoClient(
                host=address,
                port=port,
                username=name,
                password=pwd,
                authSource=database,
            )
        else:
            self.conn = MongoClient(host=address, port=port)
        self.db = self.conn[database]

    def insert_one(self, collection, data):
        """Insert one document and return its ``inserted_id``."""
        ret = self.db[collection].insert_one(data)
        return ret.inserted_id

    def insert_many(self, collection, data):
        """Insert several documents and return their ``inserted_ids``."""
        ret = self.db[collection].insert_many(data)
        return ret.inserted_ids

    def update(self, collection, data):
        """Update every document matching the old values with the new values.

        Args:
            collection: Collection name.
            data: Mapping of ``{key: [old_value, new_value]}``.  The old
                values form the filter, the new values form the ``$set``.

        Returns:
            The ``modified_count`` reported by ``update_many``.
        """
        data_filter = {key: pair[0] for key, pair in data.items()}
        data_revised = {key: pair[1] for key, pair in data.items()}
        result = self.db[collection].update_many(data_filter, {"$set": data_revised})
        return result.modified_count

    def find(self, col, condition, column=None):
        """Run ``find`` on *col*; *column* is passed as the second argument when given."""
        if column is None:
            return self.db[col].find(condition)
        return self.db[col].find(condition, column)

    def find_one(self, col, filter=None, condition=None, column=None):
        """Run ``find_one`` on *col*.

        Args:
            col: Collection name.
            filter: First argument forwarded to ``find_one``.
            condition: Legacy second positional argument.  It has always been
                forwarded as the second argument of ``find_one`` and is kept
                as the fallback so existing callers behave the same.
            column: Second argument for ``find_one``; takes precedence over
                *condition* when given (it used to be ignored).
        """
        projection = column if column is not None else condition
        return self.db[col].find_one(filter, projection)

    def delete(self, col, condition):
        """Delete every document matching *condition*; return ``deleted_count``."""
        return self.db[col].delete_many(filter=condition).deleted_count

(2)将details中不同来源但指向同一部电影的记录合并为一条,送入movie集合中

import json
from collections import defaultdict
from db import Database
from setting import setting
from collections import defaultdict
from tqdm import tqdm

db = Database('movie', setting['host'], 27017, setting['username'], setting['password'])

求两个列表的交集

def intersect(list1, list2):
    """Return the elements that occur in both inputs as a list.

    Duplicates are collapsed and the order of the result is unspecified,
    because the computation goes through sets.
    """
    shared = set(list1)
    shared &= set(list2)
    return list(shared)

在douban的记录上进行添改

douban和其他来源candidates的"sourceId"、"rating"、"rateNum"、"url"、"cover"放入source中
douban原本的source、sourceId就不要了
type合并;'nameFrn'、'summary'、'directors'、'country'、'language'、'stars'如无,使用其他来源的

def wrap(data, candidates):
    """Merge *candidates* into the record *data* and insert it into ``movie``.

    *data* is modified in place:

    * ``type`` becomes the union of the ``type`` lists of all records.
    * ``nameFrn``, ``summary``, ``directors``, ``country``, ``language`` and
      ``stars`` are filled from the first candidate that has a truthy value,
      but only when *data* itself has none.
    * ``year`` is filled the same way, truncated to its first four characters
      (presumably a date string -- confirm against the data).
    * ``sourceId``, ``rating``, ``rateNum``, ``url`` and ``cover`` of every
      record are moved into ``data['source'][<source name>]`` and removed
      from the top level of *data*.

    Args:
        data: The base record (a dict with at least a ``source`` key).
        candidates: List of records from other sources that describe the
            same movie; each needs a ``source`` key.

    Returns:
        None.  The merged record is written through the module-level ``db``.
    """
    per_source_keys = ("sourceId", "rating", "rateNum", "url", "cover")
    fallback_keys = ('nameFrn', 'summary', 'directors', 'country', 'language', 'stars')

    # Union of all ``type`` lists.
    merged_types = set(data.get('type') or [])
    for item in candidates:
        if item.get('type'):
            merged_types |= set(item['type'])
    data['type'] = list(merged_types)

    # Borrow missing descriptive fields from the first candidate that has them.
    for key in fallback_keys:
        if data.get(key):
            continue
        for item in candidates:
            if item.get(key):
                data[key] = item[key]
                break

    if not data.get('year'):
        for item in candidates:
            if item.get('year'):
                data['year'] = item['year'][:4]
                break

    # Collect the per-source fields of every record before touching *data*,
    # then drop them from the top level of *data*.
    sources = {}
    for item in [data] + candidates:
        sources[item["source"]] = {
            key: item[key] for key in per_source_keys if item.get(key)
        }
    for key in per_source_keys:
        data.pop(key, None)
    data['source'] = sources

    # Drop the old _id so that a new one is generated on insert.  This works
    # on the *data* argument rather than on a global of the calling script.
    data.pop('_id', None)
    db.insert_one('movie', data)

判断两个记录是否指同一个电影

不同的字段可靠性也不同,有先后区别,仅当两者都有指定字段再判断
首先,name不相同,false;country没有交集,false;year不相同,false
director, writers, stars依次判断是否有交集,有,true,无,false
nameFrn相同,true

def same(doc1, doc2):
    """Decide whether two detail records describe the same movie.

    A field is only compared when both records contain it.  The checks run in
    this order:

    1. Different ``name``                      -> False
    2. ``country`` lists without a common item -> False
    3. Different ``year``                      -> False
    4. The first of ``directors``/``director``, ``writers``, ``stars`` that
       both records contain decides: True if the lists overlap, else False.
    5. Equal ``nameFrn``                       -> True
    6. Otherwise                               -> False

    Args:
        doc1: First record (dict); must contain ``name``.
        doc2: Second record (dict); must contain ``name``.

    Returns:
        bool: True when the records are considered the same movie.
    """
    if doc1['name'] != doc2['name']:
        return False

    def both_have(key):
        return key in doc1 and key in doc2

    def overlap(key):
        return not set(doc1[key]).isdisjoint(doc2[key])

    if both_have('country') and not overlap('country'):
        return False
    if both_have('year') and doc1['year'] != doc2['year']:
        return False

    # NOTE(review): the merge step elsewhere in this file uses the key
    # 'directors', so that spelling is checked first; the original
    # 'director' spelling is still accepted.  Confirm the real schema.
    for key in ('directors', 'director', 'writers', 'stars'):
        if both_have(key):
            return overlap(key)

    return both_have('nameFrn') and doc1['nameFrn'] == doc2['nameFrn']


# One query on the details collection per source name, keyed by that name.
cursor = dict()
for source in ('douban', 'maoyan', 'mtime'):
    cursor[source] = db.find('details', dict(source=source))

# Count the documents per name; for names that occur more than once
# (count > 1) keep the set of their _id values.
pipeline = [
    {
        '$group': {
            '_id': "$name",
            'uniqueIds': {
                '$addToSet': '$_id'
            },
            'count': {
                '$sum': 1
            }
        }
    },
    {
        '$match': {
            'count': {
                '$gt': 1
            }
        }
    }
]

# Run the aggregation once and keep the result, so tqdm gets its total
# without executing the pipeline a second time.
groups = list(db.db.details.aggregate(pipeline))

for group in tqdm(groups, total=len(groups)):
    # Bucket the records that share this name by their source.
    docs = defaultdict(list)
    for doc_id in group["uniqueIds"]:
        doc = db.find_one('details', doc_id)
        docs[doc["source"]].append(doc)

    # All duplicates come from a single source: nothing to merge.
    if len(docs) == 1:
        continue

    douban_dict = {}
    res = defaultdict(list)
    for douban in docs['douban']:
        source_id = douban['sourceId']
        douban_dict[source_id] = douban
        matches = res[source_id]

        matches.extend(item for item in docs['mtime'] if same(douban, item))
        size = len(matches)
        # More than one candidate from one source means same() was
        # ambiguous.  same() could return a likelihood instead and the best
        # match could be taken, but this is rare (five or six records), so
        # it is only reported here; the same applies to the check below.
        if size > 1:
            print(matches)

        matches.extend(item for item in docs['maoyan'] if same(douban, item))
        if len(matches) - size > 1:
            print(matches)

        if len(matches) > 0:
            wrap(douban_dict[source_id], matches)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值