目录
线性回归原理
详见博文:
线性回归代码(Spark Python)
代码里数据: 密码:acq1
# -*- coding: utf-8 -*-
# Linear regression with Spark MLlib (Python API).
#
# The script loads a text data set, trains a linear model with
# LinearRegressionWithSGD, prints the mean squared error measured on the
# training data, then saves the model and loads it back.
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import (
    LabeledPoint,
    LinearRegressionModel,
    LinearRegressionWithSGD,
)

sc = SparkContext('local')


def parsePoint(line):
    """Convert one text line of the data set into a LabeledPoint.

    Every number on the line is converted to a float. The first number is
    used as the label and the remaining numbers as the features. Commas and
    whitespace are both treated as separators.

    Args:
        line: A string of numbers separated by commas and/or whitespace.

    Returns:
        A LabeledPoint built from the label and the feature values.

    Raises:
        ValueError: If a token cannot be converted to float.
        IndexError: If the line contains no numbers at all.
    """
    # split() with no argument collapses runs of whitespace, so doubled or
    # trailing spaces do not produce empty tokens that float() would reject.
    values = [float(x) for x in line.replace(',', ' ').split()]
    return LabeledPoint(values[0], values[1:])


# Load and parse the data.
data = sc.textFile("data/mllib/ridge-data/lpsa.data")
# first() fetches a single record instead of collecting the whole RDD to the
# driver just to look at element 0.
# Sample output quoted in the original post:
#   -0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.024....-0.864466507337306
print(data.first())

parsedData = data.map(parsePoint)
# Sample output quoted in the original post:
#   (-0.4307829,[-1.63735562648,-2.00621178481,-1.86242597251,-1.024....,-0.864466507337])
print(parsedData.first())

# Build the model.
model = LinearRegressionWithSGD.train(parsedData, iterations=1000, step=0.1)

# Evaluate the model on the training data: pair every label with the model's
# prediction, then average the squared differences.
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
# Sample output quoted in the original post: Mean Squared Error = 6.32693963099
print("Mean Squared Error = " + str(MSE))

# Save the model, then load it back and predict with the reloaded copy.
model.save(sc, "pythonLinearRegressionWithSGDModel")
sameModel = LinearRegressionModel.load(sc, "pythonLinearRegressionWithSGDModel")
# Sample output quoted in the original post: -1.86583391312
print(sameModel.predict(parsedData.first().features))