All about DataScience, DataEngineering and ComputerScience
View the Project on GitHub datainsightat/DataScience_Examples
# Import ALS together with Rating; Rating is required below to build the
# ratings RDD and was previously used without being imported (NameError).
from pyspark.mllib.recommendation import ALS, Rating

# Load the raw text file into an RDD of lines.
# NOTE: `sc` (the SparkContext) and `file_path` are not defined in this
# snippet and must already exist in the running session.
data = sc.textFile(file_path)

# Split every line on commas into a list of string fields
ratings = data.map(lambda l: l.split(','))

# Convert the first three fields of each record into a Rating:
# field 0 -> int, field 1 -> int, field 2 -> float.
# Presumably these are user id, product id and rating value, matching
# Rating(user, product, rating) -- confirm against the data file.
ratings_final = ratings.map(lambda line: Rating(int(line[0]), int(line[1]), float(line[2])))
# Randomly split the ratings into training and test sets with weights 0.8 / 0.2
training_data, test_data = ratings_final.randomSplit([0.8, 0.2])

# Train the ALS model on the training data only
model = ALS.train(training_data, rank=10, iterations=10)

# Drop the rating column so that only the first two fields of each test
# record are passed to the model
testdata_no_rating = test_data.map(lambda p: (p[0], p[1]))

# Predict ratings for the pairs taken from the test set
predictions = model.predictAll(testdata_no_rating)

# Print the first rows of the RDD. A bare `predictions.take(2)` only shows
# its result in an interactive prompt; print() makes the output explicit.
print(predictions.take(2))
# Key the held-out ratings by their first two fields: ((field0, field1), rating).
# Only `test_data` is used here: predictions exist only for test pairs, so
# joining against the full dataset would shuffle the training ratings for
# nothing and could mix training ratings into the reported test error when
# a pair appears more than once.
rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))

# Key the predictions the same way so both RDDs share a join key
preds = predictions.map(lambda r: ((r[0], r[1]), r[2]))

# Join on the key; each value becomes (actual_rating, predicted_rating)
rates_and_preds = rates.join(preds)

# Calculate and print MSE: mean of the squared differences between the
# actual and the predicted rating
MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error of the model for the test data = {:.2f}".format(MSE))