目录
2.4.1 com.qf.bigata.transformer.ItemBaseFeatureModelData
2.4.2 com/qf/bigata/transformer/ItemCFModelData.scala
2.4.3 com/qf/bigata/transformer/LRModelData.scala
2.4.4 com/qf/bigata/transformer/ModelData.scala
2.4.5 com/qf/bigata/transformer/UnionFeatureModelData.scala
2.4.6 com/qf/bigata/transformer/UserBaseFeatureModelData.scala
2.5.1 com.qf.bigata.udfs.FeatureUDF
2.6.2 com.qf.bigata.utils.HBaseUtils
2.7.2 com/qf/bigata/AlsModelData.scala
2.7.3 com.qf.bigata.ArticleEmbedding
2.7.4 com.qf.bigata.transformer.ArticleEmbeddingModelData
2.7.5 com.qf.bigata.ItemBaseFeature
2.7.9 com.qf.bigata.UserBaseFeature
2.8.2 org.jpmml.sparkml.feature.StringVectorConverter
3.2.1 com.qf.bigdata.dao.impl.HBaseDaoImpl
3.2.2 com.qf.bigdata.dao.impl.MilvusDaoImpl
3.2.3 com.qf.bigdata.dao.impl.PrestoDaoImpl
3.2.4 com.qf.bigdata.dao.DataSourceConfig
3.2.5 com.qf.bigdata.dao.HBaseConfig
3.2.6 com.qf.bigdata.dao.HBaseDao
3.2.7 com.qf.bigdata.dao.MilvusConfig
3.2.8 com.qf.bigdata.dao.MilvusDao
3.2.9 com.qf.bigdata.dao.PrestoDao
3.3.1 com.qf.bigdata.pojo.DauPredictInfo
3.3.2 com.qf.bigdata.pojo.HBaseProperties
3.3.3 com.qf.bigdata.pojo.MilvusProperties
3.3.5 com.qf.bigdata.pojo.RecommendInfo
3.3.6 com.qf.bigdata.pojo.RecommendResult
3.3.7 com.qf.bigdata.pojo.RetentionCurvelInfo
3.3.8 com.qf.bigdata.pojo.Sample
3.3.9 com.qf.bigdata.pojo.UserEmbeddingInfo
3.3.10 com.qf.bigdata.pojo.UserEmbeddingResult
3.4.1 com.qf.bigdata.service.impl.RecommendServiceImpl
3.4.2 com.qf.bigdata.service.impl.RetentionServiceImpl
3.4.3 com.qf.bigdata.service.impl.UserEmbeddingServiceImpl
3.4.4 com.qf.bigdata.service.RecommendService
3.4.5 com.qf.bigdata.service.RetentionService
3.4.6 com.qf.bigdata.service.UserEmbeddingService
3.5.1 com.qf.bigdata.utils.HBaseUtils
3.5.2 com.qf.bigdata.utils.Leastsq
3.5.3 com.qf.bigdata.utils.MilvusUtils
3.5.5 com.qf.bigdata.utils.TimeUtils
3.6.1 com.qf.bigdata.web.controller.DauController
3.6.2 com.qf.bigdata.web.controller.RecommendController
3.6.3 com.qf.bigdata.web.controller.UserEmbeddingController
3.6.4 com.qf.bigdata.Application
3.6.6 com.qf.bigdata.TomcatConfig
其实这个项目字数太多了,博客都已经上升到十三万字数。类的话也有几十个类。
背景指路
项目四:使用SparkSQL开发的简易推荐系统_林柚晞的博客-CSDN博客_spark推荐系统开发案例
我摊牌了,我只想躺平去多刷题了。现在我就把之前做推荐系统的代码发一下以供参考。
这里搞了两个召回策略,我不太熟悉ALS。
0 pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the recommend-test module.

  Builds Scala sources (src/main/scala) plus additional Java sources
  (src/main/java) against Scala 2.11 / Spark 2.4.5, and produces a shaded
  jar attached under the "jar-with-dependencies" classifier.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.qf.bigdata</groupId>
  <artifactId>recommend-test</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <scala.version>2.11.12</scala.version>
    <!-- NOTE(review): play-json.version, maven-scala-plugin.version and
         maven-assembly-plugin.version are not referenced by any active
         element in this POM; kept in case other modules rely on them. -->
    <play-json.version>2.3.9</play-json.version>
    <maven-scala-plugin.version>2.10.1</maven-scala-plugin.version>
    <scala-maven-plugin.version>3.2.0</scala-maven-plugin.version>
    <maven-assembly-plugin.version>2.6</maven-assembly-plugin.version>
    <spark.version>2.4.5</spark.version>
    <!-- Scope applied to the Spark/Hadoop/HBase/Hive/Scala/log4j dependencies
         below. Allowed values used by this project: compile | provided. -->
    <scope.type>compile</scope.type>
    <json.version>1.2.3</json.version>
    <hbase.version>1.3.6</hbase.version>
    <hadoop.version>2.8.1</hadoop.version>
  </properties>

  <dependencies>
    <!-- JSON library -->
    <!-- NOTE(review): fastjson 1.2.3 is a very old release; check it against
         current security advisories before deploying. -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>${json.version}</version>
    </dependency>

    <!-- Spark -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>${spark.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.11</artifactId>
      <version>${spark.version}</version>
      <scope>${scope.type}</scope>
    </dependency>

    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.28</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>commons-codec</groupId>
      <artifactId>commons-codec</artifactId>
      <version>1.6</version>
    </dependency>

    <!-- Scala runtime -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-reflect</artifactId>
      <version>${scala.version}</version>
      <scope>${scope.type}</scope>
    </dependency>

    <dependency>
      <groupId>com.github.scopt</groupId>
      <artifactId>scopt_2.11</artifactId>
      <version>4.0.0-RC2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-avro_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-jdbc</artifactId>
      <version>2.3.7</version>
      <scope>${scope.type}</scope>
      <exclusions>
        <exclusion>
          <groupId>javax.mail</groupId>
          <artifactId>mail</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty.aggregate</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    <!-- Hadoop / HBase -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>${hbase.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>${hbase.version}</version>
      <scope>${scope.type}</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-hadoop2-compat</artifactId>
      <version>${hbase.version}</version>
      <scope>${scope.type}</scope>
    </dependency>

    <!-- Spark ML pipeline to PMML export -->
    <dependency>
      <groupId>org.jpmml</groupId>
      <artifactId>jpmml-sparkml</artifactId>
      <version>1.5.9</version>
    </dependency>
  </dependencies>

  <repositories>
    <repository>
      <id>alimaven</id>
      <!-- HTTPS is required: Maven 3.8.1 and later block plain-http
           repositories by default. Same host and path as before. -->
      <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
      <releases>
        <updatePolicy>never</updatePolicy>
      </releases>
      <snapshots>
        <updatePolicy>never</updatePolicy>
      </snapshots>
    </repository>
  </repositories>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/</testSourceDirectory>
    <plugins>
      <!-- Builds the shaded jar during the package phase and attaches it
           with the "jar-with-dependencies" classifier. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.2.4</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <shadedArtifactAttached>true</shadedArtifactAttached>
              <shadedClassifierName>jar-with-dependencies</shadedClassifierName>
              <filters>
                <!-- Leave out the sparkml2pmml.properties that ships inside
                     the jpmml-sparkml jar. Presumably this lets the project's
                     own copy of that file take effect; confirm. -->
                <filter>
                  <artifact>org.jpmml:jpmml-sparkml</artifact>
                  <excludes>
                    <exclude>META-INF/sparkml2pmml.properties</exclude>
                  </excludes>
                </filter>
                <!-- Leave out jar signature files from every artifact. -->
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>

      <!-- Disabled alternative packaging via maven-assembly-plugin, kept for reference:
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>${maven-assembly-plugin.version}</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <filters>
            <filter>
            </filter>
          </filters>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      -->

      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>${scala-maven-plugin.version}</version>
        <executions>
          <!-- Compile Scala first to prevent "cannot find symbol" errors. -->
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-archetype-plugin</artifactId>
        <version>2.2</version>
      </plugin>

      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>1.8</version>
        <executions>
          <!-- Register src/main/java as an additional source root
               (the primary sourceDirectory above is src/main/scala). -->
          <execution>
            <id>add-source</id>
            <phase>generate-sources</phase>
            <goals>
              <goal>add-source</goal>
            </goals>
            <configuration>
              <sources>
                <source>src/main/java</source>
              </sources>
            </configuration>
          </execution>
          <!-- Register src/test/java as an additional test source root. -->
          <execution>
            <id>add-test-source</id>
            <phase>generate-test-sources</phase>
            <goals>
              <goal>add-test-source</goal>
            </goals>
            <configuration>
              <sources>
                <source>src/test/java</source>
              </sources>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
大概的项目框架
架构长这样
1.0 资源
1.1 sparkml2pmml.properties
# Mapping from Spark ML pipeline stage classes (key) to the JPMML-SparkML
# converter classes (value) that handle them.
#
# One mapping per line: in .properties syntax a line starting with '#' is a
# comment up to the end of the line, so the entries must not share a line
# with the section headers or with each other.

# Features
org.apache.spark.ml.feature.Binarizer = org.jpmml.sparkml.feature.BinarizerConverter
org.apache.spark.ml.feature.Bucketizer = org.jpmml.sparkml.feature.BucketizerConverter
org.apache.spark.ml.feature.ChiSqSelectorModel = org.jpmml.sparkml.feature.ChiSqSelectorModelConverter
org.apache.spark.ml.feature.ColumnPruner = org.jpmml.sparkml.feature.ColumnPrunerConverter
org.apache.spark.ml.feature.CountVectorizerModel = org.jpmml.sparkml.feature.CountVectorizerModelConverter
org.apache.spark.ml.feature.IDFModel = org.jpmml.sparkml.feature.IDFModelConverter
org.apache.spark.ml.feature.ImputerModel = org.jpmml.sparkml.feature.ImputerModelConverter
org.apache.spark.ml.feature.IndexToString = org.jpmml.sparkml.feature.IndexToStringConverter
org.apache.spark.ml.feature.Interaction = org.jpmml.sparkml.feature.InteractionConverter
org.apache.spark.ml.feature.MaxAbsScalerModel = org.jpmml.sparkml.feature.MaxAbsScalerModelConverter
org.apache.spark.ml.feature.MinMaxScalerModel = org.jpmml.sparkml.feature.MinMaxScalerModelConverter
org.apache.spark.ml.feature.NGram = org.jpmml.sparkml.feature.NGramConverter
org.apache.spark.ml.feature.OneHotEncoderModel = org.jpmml.sparkml.feature.OneHotEncoderModelConverter
org.apache.spark.ml.feature.PCAModel = org.jpmml.sparkml.feature.PCAModelConverter
org.apache.spark.ml.feature.RegexTokenizer = org.jpmml.sparkml.feature.RegexTokenizerConverter
org.apache.spark.ml.feature.RFormulaModel = org.jpmml.sparkml.feature.RFormulaModelConverter
org.apache.spark.ml.feature.SQLTransformer = org.jpmml.sparkml.feature.SQLTransformerConverter
org.apache.spark.ml.feature.StandardScalerModel = org.jpmml.sparkml.feature.StandardScalerModelConverter
org.apache.spark.ml.feature.StringIndexerModel = org.jpmml.sparkml.feature.StringIndexerModelConverter
org.apache.spark.ml.feature.StopWordsRemover = org.jpmml.sparkml.feature.StopWordsRemoverConverter
org.apache.spark.ml.feature.Tokenizer = org.jpmml.sparkml.feature.TokenizerConverter
org.apache.spark.ml.feature.VectorAssembler = org.jpmml.sparkml.feature.VectorAssemblerConverter
org.apache.spark.ml.feature.VectorAttributeRewriter = org.jpmml.sparkml.feature.VectorAttributeRewriterConverter
org.apache.spark.ml.feature.VectorIndexerModel = org.jpmml.sparkml.feature.VectorIndexerModelConverter
org.apache.spark.ml.feature.VectorSizeHint = org.jpmml.sparkml.feature.VectorSizeHintConverter
org.apache.spark.ml.feature.VectorSlicer = org.jpmml.sparkml.feature.VectorSlicerConverter
# NOTE(review): StringVector / StringVectorConverter look like project-defined
# additions rather than stock classes - confirm both exist on the classpath.
org.apache.spark.ml.feature.StringVector = org.jpmml.sparkml.feature.StringVectorConverter

# Prediction models
org.apache.spark.ml.classification.DecisionTreeClassificationModel = org.jpmml.sparkml.model.DecisionTreeClassificationModelConverter
org.apache.spark.ml.classification.GBTClassificationModel = org.jpmml.sparkml.model.GBTClassificationModelConverter
org.apache.spark.ml.classification.LinearSVCModel = org.jpmml.sparkml.model.LinearSVCModelConverter
org.apache.spark.ml.classification.LogisticRegressionModel = org.jpmml.sparkml.model.LogisticRegressionModelConverter
org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel = org.jpmml.sparkml.model.MultilayerPerceptronClassificationModelConverter
org.apache.spark.ml.classification.NaiveBayesModel = org.jpmml.sparkml.model.NaiveBayesModelConverter
org.apache.spark.ml.classification.RandomForestClassificationModel = org.jpmml.sparkml.model.RandomForestClassificationModelConverter
org.apache.spark.ml.clustering.KMeansModel = org.jpmml.sparkml.model.KMeansModelConverter
org.apache.spark.ml.regression.DecisionTreeRegressionModel = org.jpmml.sparkml.model.DecisionTreeRegressionModelConverter
org.apache.spark.ml.regression.GBTRegressionModel = org.jpmml.sparkml.model.GBTRegressionModelConverter
org.apache.spark.ml.regression.GeneralizedLinearRegressionModel = org.jpmml.sparkml.model.GeneralizedLinearRegressionModelConverter
org.apache.spark.ml.regression.LinearRegressionModel = org.jpmml.sparkml.model.LinearRegressionModelConverter
org.apache.spark.ml.regression.RandomForestRegressionModel = org.jpmml.sparkml.model.RandomForestRegressionModelConverter
1.2 core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <!-- Unique identifier of the HDFS filesystem (scheme, host, port);
         also the address the internal daemons use to communicate. -->
    <name>fs.defaultFS</name>
    <value>hdfs://qianfeng01:8020</value>
  </property>

  <property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop/tmp</value>
  </property>

  <!-- Proxy-user settings for "root": no restriction on hosts or groups. -->
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
</configuration>
1.3 hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <!-- Location where the NameNode daemon stores the fsimage metadata files
       it manages. Declared once; an identical second declaration that used
       to follow the HTTP address settings was redundant and was removed. -->
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///usr/local/hadoop/hdpdata/dfs/name</value>
  </property>

  <!-- Where on the local filesystem a DFS DataNode should store its blocks. -->
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///usr/local/hadoop/hdpdata/dfs/data</value>
  </property>

  <!-- Number of replicas per block. -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>

  <!-- Block size (128 MB); the value below is in bytes. -->
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>

  <!-- HTTP address (host name and port) of the SecondaryNameNode daemon;
       see the daemon layout. -->
  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>qianfeng01:50090</value>
  </property>

  <!-- HTTP address (host name and port) of the NameNode daemon;
       see the daemon layout. -->
  <property>
    <name>dfs.namenode.http-address</name>
    <value>qianfeng01:50070</value>
  </property>

  <!-- Checkpoint settings: both point at the same local directory. -->
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file:///usr/local/hadoop/hdpdata/dfs/cname</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.edits.dir</name>
    <value>file:///usr/local/hadoop/hdpdata/dfs/cname</value>
  </property>
</configuration>
1.4 hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <configuration> <property> <name>javax.jdo.option.ConnectionUserName</name> <value>root</value> </property> <property> <name>javax.jdo.option.ConnectionPassword</name> <value>@Mmforu45</value> </property> <property> <name>javax.jdo.option.ConnectionURL</name> <value>jdbc:mysql://qianfeng01:3306/hive?createDatabaseIfNotExist=true</value> </property> <property> <name>javax.jdo.option.ConnectionDriverName</name> <value>com.mysql.jdbc.Driver</value> </property> <property> <n