This code takes more than 5 days to run on the full data set, although it worked perfectly on the test data.
# Start the wall-clock timer; the elapsed time is Sys.time() - T1.
T1 <- Sys.time()
# Recommendation program: item-based nearest neighbours.
# One-time setup if the packages are missing:
#   install.packages("data.table")
#   install.packages("reshape2")
#library(Rserve)
library(data.table)
library(reshape2)

# Raw purchase lines. The pivot below reads the id_customer and product_id
# columns of this file.
table1 <- read.table("base_test_fil.csv", sep = ";", quote = "\"",
                     header = TRUE)
# data.table::fread() would be a faster reader for very large files.
table3 <- as.data.table(table1)

# Pivot to one row per customer and one column per product. With
# fun.aggregate = length each cell holds the number of purchase lines for
# that customer/product pair (0 when the pair never occurs).
table2 <- dcast.data.table(table3, id_customer ~ product_id,
                           value.var = "product_id", fun.aggregate = length)
table2 <- as.data.frame(table2)

# Turn the counts into 0/1 purchase flags. The rest of the script treats the
# matrix as binary ("bought / not bought"); without this step a product
# bought twice is stored as 2 and is no longer recognised as bought.
product.columns <- setdiff(names(table2), "id_customer")
if (length(product.columns) > 0L) {
  table2[product.columns] <- lapply(
    table2[product.columns],
    function(purchase.count) as.integer(purchase.count > 0L)
  )
}

data.stat <- table2
# Affects printed output only.
options(digits = 4)

# Purchase matrix without the id_customer column. drop = FALSE keeps a data
# frame even when there is a single product column.
data.stat.ibs <- data.stat[, !(names(data.stat) %in% c("id_customer")),
                           drop = FALSE]
# No NA placeholder is pre-allocated for the similarity matrix: it is assigned
# in one go from getCosine(), so a products x products NA matrix would only
# cost memory.
# Cosine similarity between every pair of columns of a numeric matrix.
#
# x: numeric matrix, one column per item and one row per observation.
# Returns a square matrix with ncol(x) rows and columns whose [i, j] entry is
#   sum(x[, i] * x[, j]) / sqrt(sum(x[, i]^2) * sum(x[, j]^2)).
# Row and column names are the column names of x. A column made only of
# zeros has norm 0, so every similarity involving it is NaN (0 / 0).
getCosine <- function(x) {
  # crossprod(x) is t(x) %*% x without materialising the transpose and
  # without copying x through a redundant column subset.
  dot.products <- crossprod(x)
  squared.norms <- colSums(x^2)
  # Outer product of the squared norms; its square root is the matrix of
  # norm products |x_i| * |x_j|.
  dot.products / sqrt(tcrossprod(squared.norms))
}
# Item-to-item cosine similarity. A plain numeric matrix is kept for the
# ranking loop below; the data frame version is what later code reads.
similarity.matrix <- getCosine(as.matrix(data.stat.ibs))
# Back to dataframe
data.stat.ibs.similarity <- as.data.frame(similarity.matrix)
#View(data.stat.ibs.similarity)

# Top 10 neighbours of each product.
# data.stat.neighbours has one row per product: column 1 is the product
# itself and columns 2..11 are its neighbours by decreasing similarity.
# Rows are padded with NA when there are fewer than 10 other products.
n.neighbours <- 10L
product.names <- colnames(similarity.matrix)
data.stat.neighbours <- matrix(
  NA_character_,
  nrow = length(product.names),
  ncol = n.neighbours + 1L,
  dimnames = list(product.names, NULL)
)
for (i in seq_along(product.names)) {
  # Rank one column of the numeric matrix instead of reordering the whole
  # data frame once per product.
  ranked <- order(similarity.matrix[, i], decreasing = TRUE)
  # Exclude the product itself explicitly: another product with an identical
  # purchase pattern also has similarity 1 and may be sorted ahead of it, so
  # "drop the first entry" is not reliable.
  ranked <- ranked[ranked != i]
  nearest <- head(rownames(similarity.matrix)[ranked], n.neighbours)
  data.stat.neighbours[i, 1L] <- product.names[i]
  data.stat.neighbours[i, 1L + seq_along(nearest)] <- nearest
}

# Drop the self column so only the neighbours are exported.
data <- data.stat.neighbours[, -1, drop = FALSE]
colnames(data) <- paste0("best", seq_len(n.neighbours))
#Export csv file
write.csv(data, file = "export1.csv", row.names = TRUE)
#Sys.time()-T1
#####user_based_collaborative
#Lets make a helper function to calculate the scores
# Similarity-weighted average of a purchase history.
#
# history:      numeric vector, the customer's purchase flag for each
#               neighbouring product.
# similarities: numeric vector of the same length, the similarity of each
#               neighbouring product to the candidate product.
# Returns sum(history * similarities) / sum(similarities). When the
# similarities add up to exactly 0 there is no evidence for the candidate, so
# the score is 0 rather than NaN (0 / 0). NA or NaN in either input
# propagates to the result.
getScore <- function(history, similarities)
{
  total.similarity <- sum(similarities)
  # isTRUE() keeps an NA total out of the guard so that it still propagates.
  if (isTRUE(total.similarity == 0)) {
    return(0)
  }
  sum(history * similarities) / total.similarity
}
# Score every (customer, product) pair and export each customer's best
# products.
#
# Differences from a naive double loop over customers and products:
#  * The neighbours of a product do not depend on the customer, so they are
#    ranked once per product instead of once per (customer, product) pair.
#  * Scores stay numeric. Products a customer already bought are marked NA
#    rather than "", because a single "" turns the whole matrix into
#    character and the final ranking would then sort scores as text
#    ("1e-04" above "0.9", "NaN" above everything).
#  * "Already bought" means a purchase count above 0, not exactly 1.

# Plain matrices make row/column lookups cheap inside the loop.
purchases <- as.matrix(
  data.stat[, setdiff(names(data.stat), "id_customer"), drop = FALSE]
)
similarity <- as.matrix(data.stat.ibs.similarity)
product.names <- colnames(purchases)
n.neighbours <- 10L

# Score matrix: one row per customer, one column per product.
holder <- matrix(
  NA_real_,
  nrow = nrow(purchases),
  ncol = ncol(purchases),
  dimnames = list((data.stat$id_customer), product.names)
)

# We do not want to recommend products the customer has already consumed.
already.bought <- purchases > 0

# Loop through the products (columns)
for (j in seq_along(product.names)) {
  product <- product.names[j]
  # Only customers who have not bought this product get a score.
  candidates <- which(!already.bought[, j])
  if (length(candidates) == 0L) {
    next
  }

  # Top 10 neighbours of the product, sorted by similarity. The product
  # itself is removed by name because a tie at similarity 1 can put another
  # product ahead of it in the ranking.
  product.similarities <- similarity[, product]
  ranked <- rownames(similarity)[order(product.similarities,
                                       decreasing = TRUE)]
  topN.names <- head(ranked[ranked != product], n.neighbours)
  topN.similarities <- unname(product.similarities[topN.names])

  # Purchase history of the candidate customers for those neighbours.
  topN.purchases <- purchases[candidates, topN.names, drop = FALSE]

  # getScore() stays the single place where the scoring rule is defined.
  holder[candidates, j] <- vapply(
    seq_along(candidates),
    function(r) {
      getScore(history = topN.purchases[r, ],
               similarities = topN.similarities)
    },
    numeric(1)
  )
} # end product for loop

# Numeric scores; NA marks products the customer already bought.
data.stat.id_customer.scores <- holder
#write.csv(file="final-id_customer-scores.csv",data.stat.id_customer.scores)

# Make the recommendations readable: for each customer, the names of the
# best-scoring products in decreasing score order. Rows are padded with NA
# when a customer has fewer than 100 scored products.
n.recommendations <- 100L
data.stat.id_customer.scores.holder <- matrix(
  NA_character_,
  nrow = nrow(data.stat.id_customer.scores),
  ncol = n.recommendations,
  dimnames = list(rownames(data.stat.id_customer.scores), NULL)
)
for (i in seq_len(nrow(data.stat.id_customer.scores))) {
  # na.last = NA drops already-bought products (NA) and undefined scores
  # (NaN) from the ranking. Only this customer's row is sorted.
  ranked <- order(data.stat.id_customer.scores[i, ], decreasing = TRUE,
                  na.last = NA)
  best <- head(colnames(data.stat.id_customer.scores)[ranked],
               n.recommendations)
  data.stat.id_customer.scores.holder[i, seq_along(best)] <- best
}

# Write output to file
write.csv(data.stat.id_customer.scores.holder, file = "export3.csv",
          row.names = TRUE)
base_test_fil.csv
, or edit your question/code to contain its own test data set. – Curt F. Aug 18 at 9:30