From ecbc11a572c1c150efd206eb6e9588acaa54043c Mon Sep 17 00:00:00 2001 From: markilue <745518019@qq.com> Date: Fri, 20 May 2022 21:33:10 +0800 Subject: [PATCH] =?UTF-8?q?=E7=94=A8=E6=88=B7=E7=94=BB=E5=83=8Fexample?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../user_profile/machine-learning/pom.xml | 73 +++++++ .../src/main/resources/config.properties | 11 ++ .../src/main/resources/core-site.xml | 19 ++ .../src/main/resources/hdfs-site.xml | 19 ++ .../src/main/resources/hive-site.xml | 41 ++++ .../src/main/resources/log4j.properties | 12 ++ .../userprofile/ml/pipline/MyPipeLine.scala | 183 ++++++++++++++++++ .../ml/train/StudGenderTrain.scala | 70 +++++++ Big_data_example/user_profile/pom.xml | 1 + 9 files changed, 429 insertions(+) create mode 100644 Big_data_example/user_profile/machine-learning/pom.xml create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/config.properties create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml create mode 100644 Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties create mode 100644 Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala create mode 100644 Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala diff --git a/Big_data_example/user_profile/machine-learning/pom.xml b/Big_data_example/user_profile/machine-learning/pom.xml new file mode 100644 index 0000000..c4a5d7b --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/pom.xml @@ -0,0 +1,73 @@ + + + + user_profile + com.atguigu + 1.0-SNAPSHOT + + 4.0.0 + + machine-learning + + + + common + com.atguigu + 1.0-SNAPSHOT + + + + org.apache.spark + spark-mllib_2.12 + provided + + + + + + + + + + + + net.alchim31.maven + scala-maven-plugin + 3.4.6 + + + + + compile + testCompile + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.0.0 + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + + + \ No newline at end of file diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties b/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties new file mode 100644 index 0000000..d76795a --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/resources/config.properties @@ -0,0 +1,11 @@ +hdfs-store.path=hdfs://Ding202:8020/user_profile +data-warehouse.dbname=gmall +user-profile.dbname=user_profile0224 + +# mysql +mysql.url=jdbc:mysql://Ding202:3306/user_profile_manager_0224?characterEncoding=utf-8&useSSL=false +mysql.username=root +mysql.password=123456 + +# clickhouse +clickhouse.url=jdbc:clickhouse://Ding202:8123/user_profile0224 diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml new file mode 100644 index 0000000..e32dcfe --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/resources/core-site.xml @@ -0,0 +1,19 @@ + + + + + io.compression.codecs + + org.apache.hadoop.io.compress.GzipCodec, + org.apache.hadoop.io.compress.DefaultCodec, + org.apache.hadoop.io.compress.BZip2Codec, + org.apache.hadoop.io.compress.SnappyCodec, + com.hadoop.compression.lzo.LzoCodec, + com.hadoop.compression.lzo.LzopCodec + + + + io.compression.codec.lzo.class + com.hadoop.compression.lzo.LzoCodec + + diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml new file mode 100644 index 0000000..671652f --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/resources/hdfs-site.xml @@ -0,0 +1,19 @@ + + + + + dfs.namenode.http-address + Ding202:9870 + + + + dfs.namenode.secondary.http-address + Ding204:9868 + + + dfs.client.use.datanode.hostname + true + + + + diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml b/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml new file mode 100644 index 0000000..32ece85 --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/resources/hive-site.xml @@ -0,0 +1,41 @@ + + + + + javax.jdo.option.ConnectionURL + jdbc:mysql://Ding202:3306/metastore?createDatabaseIfNotExist=true&characterEncoding=utf-8&useSSL=false + JDBC connect string for a JDBC metastore + + + + javax.jdo.option.ConnectionDriverName + com.mysql.jdbc.Driver + Driver class name for a JDBC metastore + + + + javax.jdo.option.ConnectionUserName + root + username to use against metastore database + + + + javax.jdo.option.ConnectionPassword + 123456 + password to use against metastore database + + + hive.cli.print.header + true + + + + hive.cli.print.current.db + true + + + hive.metastore.schema.verification + false + + + diff --git a/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties b/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties new file mode 100644 index 0000000..48f5794 --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/resources/log4j.properties @@ -0,0 +1,12 @@ +log4j.rootLogger=error, stdout,R +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n + +log4j.appender.R=org.apache.log4j.RollingFileAppender +log4j.appender.R.File=../log/agent.log +log4j.appender.R.MaxFileSize=1024KB +log4j.appender.R.MaxBackupIndex=1 + +log4j.appender.R.layout=org.apache.log4j.PatternLayout +log4j.appender.R.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%6L) : %m%n diff --git a/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala new file mode 100644 index 0000000..a3a6a67 --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/pipline/MyPipeLine.scala @@ -0,0 +1,183 @@ +package com.atguigu.userprofile.ml.pipline + +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier} +import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer} +import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer} +import org.apache.spark.sql.DataFrame + +/** + * 为了进行训练的类 + */ +class MyPipeLine { + + // 流水线对象 + var pipeline: Pipeline = null + + // 流水线训练后模型 + private var pipelineModel:PipelineModel=null + + //流水线初始化 + def init(): MyPipeLine ={ + + pipeline=new Pipeline().setStages( + Array( + createLabelIndexer(), + createFeatureAssemble(), + createFeatureIndexer(), + createClassifier() + ) + ) + this + + } + + + + var labelColName:String = null + + var featureColNames:Array[String]=null + + + + //// 以下为参数 //////////////////// + //最大分类树(用于识别连续值特征和分类特征) + private var maxCategories=5 + // 最大分支数 + private var maxBins=5 + // 最大树深度 + private var maxDepth=5 + //最小分支包含数据条数 + private var minInstancesPerNode=1 + //最小分支信息增益 + private var minInfoGain=0.0 + + + + def setLabelColName(labelColName: String): MyPipeLine = { + this.labelColName = labelColName + this + } + + def setFeatureColNames(featureColNames: Array[String]): MyPipeLine = { + this.featureColNames = featureColNames + this + } + + + def setMaxCategories(maxCategories:Int): MyPipeLine ={ + this.maxCategories=maxCategories + this + } + def setMaxBins(maxBins:Int): MyPipeLine ={ + this.maxBins=maxBins + this + } + def setMaxDepth(maxDepth:Int): MyPipeLine ={ + this.maxDepth=maxDepth + this + } + + def setMinInstancesPerNode(minInstancesPerNode:Int): MyPipeLine ={ + this.minInstancesPerNode=minInstancesPerNode + this + } + + def setMinInfoGain(minInfoGain:Double): MyPipeLine ={ + this.minInfoGain=minInfoGain + this + } + + + + //1 创建标签索引 + // 把标签值 转换为矢量值 + // ('男','女') (0,1,2,3,4,....) 按照 出现概率大小次序 概率越大 矢量越小 + //InputCol:参考答案列表 + //OutputCol:转换为矢量值得列名,自定义 + def createLabelIndexer(): StringIndexer = { + + val indexer = new StringIndexer() + //设置标签索引 -> 输入列和输出列 + indexer.setInputCol(labelColName).setOutputCol("label_index") + indexer + + } + + + //2 创建特征集合列 + //InputCol:特征列名 + //OutputCol: 特征集合列名,自定义 + def createFeatureAssemble(): VectorAssembler = { + + val assembler = new VectorAssembler() + assembler.setInputCols(featureColNames).setOutputCol("feature_assemble") + assembler + } + + + //3 创建特征索引列 + // 把特征集合中的原值, 变为矢量值 按照 出现概率大小次序 概率越大 矢量越小 + // 要识别哪些是 连续值特征(线性特征) 哪些是离散特征(分类特征) 看特征个数 个数超过一个数会当做连续值 + def createFeatureIndexer():VectorIndexer ={ + + val indexer = new VectorIndexer() + //设置最大的分类 + indexer.setInputCol("feature_assemble").setOutputCol("feature_index").setMaxCategories(maxCategories) + indexer + } + + + //4 创建分类器 ->这里使用决策树分类器 + def createClassifier():DecisionTreeClassifier ={ + + val classifier = new DecisionTreeClassifier() + + classifier.setLabelCol("label_index") + .setFeaturesCol("feature_index") + .setPredictionCol("prediction_col") + .setImpurity("gini") //使用信息熵还是gini + + classifier + } + + + //训练 + + def train(dataFrame: DataFrame):Unit ={ + pipelineModel = pipeline.fit(dataFrame) + } + + + //预测 + def predict(dataFrame: DataFrame):DataFrame ={ + val predictedDataFrame = pipelineModel.transform(dataFrame) + predictedDataFrame + + } + + + //打印出 完整的决策树 + def printDecisionTree():Unit ={ + + val transformer: Transformer = pipelineModel.stages(3) //取第四号位置的classifier + val classificationModel: DecisionTreeClassificationModel = transformer.asInstanceOf[DecisionTreeClassificationModel] + println(classificationModel.toDebugString) + + } + + + //打印出 各个特征的权值 + def printFeatureWeights():Unit ={ + + val transformer: Transformer = pipelineModel.stages(3) //取第四号位置的classifier + val classificationModel: DecisionTreeClassificationModel = transformer.asInstanceOf[DecisionTreeClassificationModel] + println(classificationModel.featureImportances) + + } + + + + + + +} diff --git a/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala new file mode 100644 index 0000000..17301fd --- /dev/null +++ b/Big_data_example/user_profile/machine-learning/src/main/scala/com/atguigu/userprofile/ml/train/StudGenderTrain.scala @@ -0,0 +1,70 @@ +package com.atguigu.userprofile.ml.train + +import com.atguigu.userprofile.ml.pipline.MyPipeLine +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +object StudGenderTrain { + + + def main(args: Array[String]): Unit = { + + //0 创建spark环境 + val sparkConf: SparkConf = new SparkConf().setAppName("stud_gender_train_app").setMaster("local[*]") + val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate() + + + + //1 查询数据 ->需要把数字转为矢量 + println("查询数据...") + val sql = + s""" + |select uid, + |case hair when '长发' then 101 + | when '短发' then 102 + | when '板寸' then 103 + | end as hair, + | height, + |case skirt when '是' then 11 + | when '否' then 12 + | end as skirt, + |case age when '00后' then 100 + | when '90后' then 90 + | when '80后' then 80 + | end as age, + | gender + |from test.student + |""".stripMargin + + println(sql) + val dataFrame = sparkSession.sql(sql) + println(dataFrame) + //2 切分数据 分成 训练集和测试集 + println("切分数据...") + + val Array(trainDF,testDF) = dataFrame.randomSplit(Array(0.8, 0.2)) + + //3 创建MyPipeline + println("创建训练...") + val myPineLine = new MyPipeLine().setLabelColName("gender") + .setFeatureColNames(Array("hair","height","skirt","age")) + .setMaxCategories(5) + .init() + //4 进行训练 + println("进行训练...") + myPineLine.train(trainDF) + + //打印决策树和权重 + myPineLine.printDecisionTree() + myPineLine.printFeatureWeights() + + //5 进行预测 + println("进行预测...") + val predictedDataFrame = myPineLine.predict(testDF) + //6 打印预测结果 + //false表示不切割 + predictedDataFrame.show(100,false) + + } + +} diff --git a/Big_data_example/user_profile/pom.xml b/Big_data_example/user_profile/pom.xml index 5bf8b88..29e980d 100644 --- a/Big_data_example/user_profile/pom.xml +++ b/Big_data_example/user_profile/pom.xml @@ -14,6 +14,7 @@ merge export-clickhouse toBitmap + machine-learning