From ecbc11a572c1c150efd206eb6e9588acaa54043c Mon Sep 17 00:00:00 2001
From: markilue <745518019@qq.com>
Date: Fri, 20 May 2022 21:33:10 +0800
Subject: [PATCH] =?UTF-8?q?=E7=94=A8=E6=88=B7=E7=94=BB=E5=83=8Fexample?=
=?UTF-8?q?=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../user_profile/machine-learning/pom.xml | 73 +++++++
.../src/main/resources/config.properties | 11 ++
.../src/main/resources/core-site.xml | 19 ++
.../src/main/resources/hdfs-site.xml | 19 ++
.../src/main/resources/hive-site.xml | 41 ++++
.../src/main/resources/log4j.properties | 12 ++
.../userprofile/ml/pipline/MyPipeLine.scala | 183 ++++++++++++++++++
.../ml/train/StudGenderTrain.scala | 70 +++++++
Big_data_example/user_profile/pom.xml | 1 +
9 files changed, 429 insertions(+)
create mode 100644 Big_data_example/user_profile/machine-learning/pom.xml
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/config.properties
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala
create mode 100644 Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala
diff --git a/Big_data_example/user_profile/machine-learning/pom.xml b/Big_data_example/user_profile/machine-learning/pom.xml
new file mode 100644
index 0000000..c4a5d7b
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/pom.xml
@@ -0,0 +1,73 @@
+
+
+
+ user_profile
+ com.atguigu
+ 1.0-SNAPSHOT
+
+ 4.0.0
+
+ machine-learning
+
+
+
+ common
+ com.atguigu
+ 1.0-SNAPSHOT
+
+
+
+ org.apache.spark
+ spark-mllib_2.12
+ provided
+
+
+
+
+
+
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+ 3.4.6
+
+
+
+
+ compile
+ testCompile
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+ 3.0.0
+
+
+ jar-with-dependencies
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties b/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties
new file mode 100644
index 0000000..d76795a
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties
@@ -0,0 +1,11 @@
+hdfs-store.path=hdfs://Ding202:8020/user_profile
+data-warehouse.dbname=gmall
+user-profile.dbname=user_profile0224
+
+# mysql
+mysql.url=jdbc:mysql://Ding202:3306/user_profile_manager_0224?characterEncoding=utf-8&useSSL=false
+mysql.username=root
+mysql.password=123456
+
+# clickhouse
+clickhouse.url=jdbc:clickhouse://Ding202:8123/user_profile0224
diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml
new file mode 100644
index 0000000..e32dcfe
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml
@@ -0,0 +1,19 @@
+
+
+
+
+ io.compression.codecs
+
+ org.apache.hadoop.io.compress.GzipCodec,
+ org.apache.hadoop.io.compress.DefaultCodec,
+ org.apache.hadoop.io.compress.BZip2Codec,
+ org.apache.hadoop.io.compress.SnappyCodec,
+ com.hadoop.compression.lzo.LzoCodec,
+ com.hadoop.compression.lzo.LzopCodec
+
+
+
+ io.compression.codec.lzo.class
+ com.hadoop.compression.lzo.LzoCodec
+
+
diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml
new file mode 100644
index 0000000..671652f
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml
@@ -0,0 +1,19 @@
+
+
+
+
+ dfs.namenode.http-address
+ Ding202:9870
+
+
+
+ dfs.namenode.secondary.http-address
+ Ding204:9868
+
+
+ dfs.client.use.datanode.hostname
+ true
+
+
+
+
diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml
new file mode 100644
index 0000000..32ece85
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml
@@ -0,0 +1,41 @@
+
+
+
+
+ javax.jdo.option.ConnectionURL
+ jdbc:mysql://Ding202:3306/metastore?createDatabaseIfNotExist=true&characterEncoding=utf-8&useSSL=false
+ JDBC connect string for a JDBC metastore
+
+
+
+ javax.jdo.option.ConnectionDriverName
+ com.mysql.jdbc.Driver
+ Driver class name for a JDBC metastore
+
+
+
+ javax.jdo.option.ConnectionUserName
+ root
+ username to use against metastore database
+
+
+
+ javax.jdo.option.ConnectionPassword
+ 123456
+ password to use against metastore database
+
+
+ hive.cli.print.header
+ true
+
+
+
+ hive.cli.print.current.db
+ true
+
+
+ hive.metastore.schema.verification
+ false
+
+
+
diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties b/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties
new file mode 100644
index 0000000..48f5794
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties
@@ -0,0 +1,12 @@
+log4j.rootLogger=error, stdout,R
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n
+
+log4j.appender.R=org.apache.log4j.RollingFileAppender
+log4j.appender.R.File=../log/agent.log
+log4j.appender.R.MaxFileSize=1024KB
+log4j.appender.R.MaxBackupIndex=1
+
+log4j.appender.R.layout=org.apache.log4j.PatternLayout
+log4j.appender.R.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%6L) : %m%n
diff --git a/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala
new file mode 100644
index 0000000..a3a6a67
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala
@@ -0,0 +1,183 @@
+package com.atguigu.userprofile.ml.pipline
+
+import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
+import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
+import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer}
+import org.apache.spark.sql.DataFrame
+
+/**
+ * Builder-style wrapper around a Spark ML decision-tree training pipeline.
+ */
+class MyPipeLine {
+
+  // the (unfitted) pipeline; built by init()
+  var pipeline: Pipeline = null
+
+  // fitted model produced by train()
+  private var pipelineModel: PipelineModel = null
+
+  // Build the pipeline stages; call after all setters have been applied.
+  def init(): MyPipeLine = {
+
+    pipeline = new Pipeline().setStages(
+      Array(
+        createLabelIndexer(),
+        createFeatureAssemble(),
+        createFeatureIndexer(),
+        createClassifier()
+      )
+    )
+    this
+
+  }
+
+
+
+  var labelColName: String = null
+
+  var featureColNames: Array[String] = null
+
+
+
+  //// tuning parameters ////////////////////
+  // max distinct values for a feature to be treated as categorical (VectorIndexer)
+  private var maxCategories = 5
+  // max number of bins when discretizing continuous features
+  private var maxBins = 5
+  // max depth of the decision tree
+  private var maxDepth = 5
+  // min number of rows each child must have after a split
+  private var minInstancesPerNode = 1
+  // min information gain required for a split
+  private var minInfoGain = 0.0
+
+
+
+  def setLabelColName(labelColName: String): MyPipeLine = {
+    this.labelColName = labelColName
+    this
+  }
+
+  def setFeatureColNames(featureColNames: Array[String]): MyPipeLine = {
+    this.featureColNames = featureColNames
+    this
+  }
+
+
+  def setMaxCategories(maxCategories: Int): MyPipeLine = {
+    this.maxCategories = maxCategories
+    this
+  }
+  def setMaxBins(maxBins: Int): MyPipeLine = {
+    this.maxBins = maxBins
+    this
+  }
+  def setMaxDepth(maxDepth: Int): MyPipeLine = {
+    this.maxDepth = maxDepth
+    this
+  }
+
+  def setMinInstancesPerNode(minInstancesPerNode: Int): MyPipeLine = {
+    this.minInstancesPerNode = minInstancesPerNode
+    this
+  }
+
+  def setMinInfoGain(minInfoGain: Double): MyPipeLine = {
+    this.minInfoGain = minInfoGain
+    this
+  }
+
+
+
+  // 1 Label indexer:
+  //   maps label strings (e.g. '男','女') to numeric indices (0,1,2,...),
+  //   ordered by frequency: the more frequent the label, the smaller the index.
+  // InputCol: the raw label column
+  // OutputCol: the indexed label column (name chosen here)
+  def createLabelIndexer(): StringIndexer = {
+
+    val indexer = new StringIndexer()
+    // wire input (raw label) and output (indexed label) columns
+    indexer.setInputCol(labelColName).setOutputCol("label_index")
+    indexer
+
+  }
+
+
+  // 2 Feature assembler: combines the feature columns into one vector column
+  // InputCols: feature column names
+  // OutputCol: assembled vector column name (chosen here)
+  def createFeatureAssemble(): VectorAssembler = {
+
+    val assembler = new VectorAssembler()
+    assembler.setInputCols(featureColNames).setOutputCol("feature_assemble")
+    assembler
+  }
+
+
+  // 3 Feature indexer: re-encodes values inside the assembled vector by frequency.
+  //   Decides which features are continuous and which are categorical by count:
+  //   a feature with more than maxCategories distinct values is treated as continuous.
+  def createFeatureIndexer(): VectorIndexer = {
+
+    val indexer = new VectorIndexer()
+    // set the categorical/continuous threshold
+    indexer.setInputCol("feature_assemble").setOutputCol("feature_index").setMaxCategories(maxCategories)
+    indexer
+  }
+
+
+  // 4 Classifier: a decision tree configured with the tuning parameters above
+  def createClassifier(): DecisionTreeClassifier = {
+
+    val classifier = new DecisionTreeClassifier()
+
+    classifier.setLabelCol("label_index")
+      .setFeaturesCol("feature_index")
+      .setPredictionCol("prediction_col")
+      .setImpurity("gini").setMaxBins(maxBins).setMaxDepth(maxDepth).setMinInstancesPerNode(minInstancesPerNode).setMinInfoGain(minInfoGain) // impurity: "gini" or "entropy"; fix: apply tuning params (setters previously had no effect)
+
+    classifier
+  }
+
+
+  // training
+
+  def train(dataFrame: DataFrame): Unit = {
+    pipelineModel = pipeline.fit(dataFrame)
+  }
+
+
+  // prediction: returns the input with prediction columns appended
+  def predict(dataFrame: DataFrame): DataFrame = {
+    val predictedDataFrame = pipelineModel.transform(dataFrame)
+    predictedDataFrame
+
+  }
+
+
+  // print the full decision tree in debug form
+  def printDecisionTree(): Unit = {
+
+    val transformer: Transformer = pipelineModel.stages(3) // stage 3 is the classifier
+    val classificationModel: DecisionTreeClassificationModel = transformer.asInstanceOf[DecisionTreeClassificationModel]
+    println(classificationModel.toDebugString)
+
+  }
+
+
+  // print the importance weight of each feature
+  def printFeatureWeights(): Unit = {
+
+    val transformer: Transformer = pipelineModel.stages(3) // stage 3 is the classifier
+    val classificationModel: DecisionTreeClassificationModel = transformer.asInstanceOf[DecisionTreeClassificationModel]
+    println(classificationModel.featureImportances)
+
+  }
+
+
+
+
+
+
+}
diff --git a/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala
new file mode 100644
index 0000000..17301fd
--- /dev/null
+++ b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala
@@ -0,0 +1,70 @@
+package com.atguigu.userprofile.ml.train
+
+import com.atguigu.userprofile.ml.pipline.MyPipeLine
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.SparkSession
+
+object StudGenderTrain {
+
+
+  def main(args: Array[String]): Unit = {
+
+    // 0 create the Spark environment (local mode, Hive support for test.student)
+    val sparkConf: SparkConf = new SparkConf().setAppName("stud_gender_train_app").setMaster("local[*]")
+    val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
+
+
+
+    // 1 query the data, mapping string categories to numeric codes for the pipeline
+    println("查询数据...")
+    val sql =
+      s"""
+         |select uid,
+         |case hair when '长发' then 101
+         | when '短发' then 102
+         | when '板寸' then 103
+         | end as hair,
+         | height,
+         |case skirt when '是' then 11
+         | when '否' then 12
+         | end as skirt,
+         |case age when '00后' then 100
+         | when '90后' then 90
+         | when '80后' then 80
+         | end as age,
+         | gender
+         |from test.student
+         |""".stripMargin
+
+    println(sql)
+    val dataFrame = sparkSession.sql(sql)
+    println(dataFrame)
+    // 2 split into training and test sets
+    println("切分数据...")
+
+    val Array(trainDF, testDF) = dataFrame.randomSplit(Array(0.8, 0.2))
+
+    // 3 build the pipeline
+    println("创建训练...")
+    val myPipeLine = new MyPipeLine().setLabelColName("gender")
+      .setFeatureColNames(Array("hair", "height", "skirt", "age"))
+      .setMaxCategories(5)
+      .init()
+    // 4 train
+    println("进行训练...")
+    myPipeLine.train(trainDF)
+
+    // print the decision tree and the feature weights
+    myPipeLine.printDecisionTree()
+    myPipeLine.printFeatureWeights()
+
+    // 5 predict on the held-out set
+    println("进行预测...")
+    val predictedDataFrame = myPipeLine.predict(testDF)
+    // 6 show predictions (truncate = false: do not clip long cell values)
+    predictedDataFrame.show(100, false)
+
+    sparkSession.stop() // fix: release Spark resources (stop() was missing)
+  }
+
+}
diff --git a/Big_data_example/user_profile/pom.xml b/Big_data_example/user_profile/pom.xml
index 5bf8b88..29e980d 100644
--- a/Big_data_example/user_profile/pom.xml
+++ b/Big_data_example/user_profile/pom.xml
@@ -14,6 +14,7 @@
merge
export-clickhouse
toBitmap
+ machine-learning