基于随机梯度下降分类算法的活动推荐系统

本文基于随机梯度下降算法,构建了针对用户的活动推荐系统。

!pip install pycountry
!pip install kaggle-cli
#!kg download -u '<KAGGLE_USERNAME>' -p '<KAGGLE_PASSWORD>' -c event-recommendation-engine-challenge -f events.csv.gz
#!gzip -d events.csv.gz

0. 加载库

from __future__ import division, print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import datetime
import itertools
import hashlib
import locale
import pickle
import pycountry

import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize
# 授权登录
# 安装 PyDrive 操作库,该操作每个 notebook 只需要执行一次
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

def login_google_drive():
    """Authenticate the Colab user and return a PyDrive ``GoogleDrive`` client.

    Returns:
        A ``GoogleDrive`` instance whose ``GoogleAuth`` carries the
        application-default Google credentials.
    """
    # Authorization step; per the original note it only prompts the first time.
    auth.authenticate_user()

    google_auth = GoogleAuth()
    google_auth.credentials = GoogleCredentials.get_application_default()
    return GoogleDrive(google_auth)
# 列出Google Drive下所有的文件
def list_file(drive, parent_id='root'):
    """Print the title, id and MIME type of every non-trashed entry in a folder.

    Args:
        drive: Client object exposing PyDrive's ``ListFile(...).GetList()``.
        parent_id: Id of the Drive folder to list. The default ``'root'``
            lists the top level of the Drive, which is what this function
            always did before the parameter existed.

    Returns:
        None. One line per entry is written to standard output.
    """
    # Same query as before, with the parent folder substituted in.
    query = "'%s' in parents and trashed=false" % parent_id
    for entry in drive.ListFile({'q': query}).GetList():
        print('title: %s, id: %s, mimeType: %s'
              % (entry['title'], entry['id'], entry['mimeType']))
# Authenticate once and keep the client in the module-level name `drive`;
# cache_data() reads this name as a global.
drive = login_google_drive()
# Print the entries found at the top level ('root') of the Drive.
list_file(drive)
title: Colab, id: 1SUtiFjgLwfR1nyRu-80mKnRRhIa6mJHC, mimeType: application/vnd.google-apps.folder
title: card2014.pdf, id: 0B6B4WgkpMXnrc3lYVkQybFZ4WkpCSnp1SW5DRGdxMnNSblhB, mimeType: application/pdf
# List the non-trashed entries inside the folder whose id was printed by the
# previous listing (shown there with the title "Colab").
file_list = drive.ListFile({
  'q': "'1SUtiFjgLwfR1nyRu-80mKnRRhIa6mJHC' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s, mimeType: %s' % (file1['title'], file1['id'], file1["mimeType"]))
title: Kaggle-event recommendation, id: 1Icj4Bcc6OxMTOQ1oqp2A084146ch5jKG, mimeType: application/vnd.google-apps.folder
title: Credit Fraud Detection, id: 1WOmeGF8Muq1em9ZbeLFa-Ml221ErrBJJ, mimeType: application/vnd.google-apps.folder
title: LSTM, id: 1XhRk76UkPExDooz8oinsFlsS8e8qt8m2, mimeType: application/vnd.google-apps.folder
# List the non-trashed entries inside the folder whose id was printed by the
# previous listing (shown there with the title "Kaggle-event recommendation").
# The ids of train.csv and test.csv printed here are the ones cache_data() uses.
file_list = drive.ListFile({
  'q': "'1Icj4Bcc6OxMTOQ1oqp2A084146ch5jKG' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s, mimeType: %s' % (file1['title'], file1['id'], file1["mimeType"]))
title: kaggle_event_recommendation, id: 13D1FY3MQU548QVqC8ooZAvvy4BgG0SSI, mimeType: application/vnd.google.colaboratory
title: test.csv, id: 1Y7YXncoQuvv6z2J4_JP-hbtVVmyaJvIl, mimeType: text/csv
title: train.csv, id: 1i8FtD9G3H5B1QcmWG8yZ4w3aq12ARQcT, mimeType: text/csv
# 缓存数据到工作环境

def cache_data():
    """Copy the train and test CSV files from Google Drive to local files.

    Uses the module-level ``drive`` client. The Drive file ids are hard-coded
    and must be replaced with the ids printed by the folder listing when the
    notebook is run against a different Drive.
    """
    # Build both Drive file handles first, then fetch them in the same order.
    downloads = [
        ('train.csv', drive.CreateFile({'id': "1i8FtD9G3H5B1QcmWG8yZ4w3aq12ARQcT"})),
        ('test.csv', drive.CreateFile({'id': "1Y7YXncoQuvv6z2J4_JP-hbtVVmyaJvIl"})),
    ]

    # Per the original note, this download is only a cache in the working
    # environment; it does not create an extra file in the Google Drive folder.
    for local_name, remote_file in downloads:
        remote_file.GetContentFile(local_name, "text/csv")

    print("缓存成功")

# Fetch the Drive copies into the local files train.csv and test.csv.
cache_data()
缓存成功

1. 定义数据清洗类

class DataCleaner(object):
    '''
    Common utilities for converting strings to equivalent numbers or number buckets
    '''
    def __init__(self):
        """Build the string-to-integer lookup tables used by the getters.

        Every table is a ``defaultdict(int)``: registered keys are numbered
        from 1 and any unknown key looks up as 0.
        """
        # Locale aliases known to the standard library, numbered from 1.
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1

        # Lower-cased country names, numbered from 1 in pycountry's order.
        self.countryIdMap = defaultdict(int)
        # Zero-based pycountry index of the countries whose subdivisions are
        # registered as aliases of the country itself.
        self.ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # Match on the ISO alpha-2 code instead of the name: pycountry
            # names the US "United States", so comparing the name with 'usa'
            # never matched and US states were silently left out.
            # `alpha2` is the attribute name used by older pycountry releases.
            code = getattr(c, 'alpha_2', None) or getattr(c, 'alpha2', None)
            if code in ('US', 'CA'):
                self.ctryIdx[code] = i

        # Map every state/province name to its country's id. These
        # assignments run after the country loop, so a subdivision name
        # overwrites a same-named country entry.
        for cc in list(self.ctryIdx.keys()):
            country_id = self.ctryIdx[cc] + 1
            for s in pycountry.subdivisions.get(country_code=cc) or []:
                self.countryIdMap[s.name.lower()] = country_id

        # Gender strings; anything other than these two looks up as 0.
        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        return self.localeIdMap[locstr.lower()]

    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr.lower()]

    def getJoinedYearMonth(self, dateString):
        """Return the year and month of a timestamp concatenated as a string.

        ``dateString`` must match ``%Y-%m-%dT%H:%M:%S.%fZ``; ``strptime``
        raises ``ValueError`` otherwise. The month is not zero-padded, so
        January 2012 becomes ``"20121"`` and October 2012 ``"201210"``.
        """
        parsed = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return str(parsed.year) + str(parsed.month)

    def getCountryId(self, location):
        if (isinstance(location, str)
            and len(location.strip()) > 0
            and location.rfind("  ") > -1):
            return self.countryIdMap[location[location.rfind(
  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值