Optimize CSV reader

Question

This code takes more than 5 days running, but, it worked perfectly on the data test.

T1=Sys.time()
# Recommendation Program
#plus proches voisins
#install.packages("reshape2") like require(reshape2)
#require(data.table)
#require(reshape2)
#install.packages("data.table")
#library(Rserve)
library(data.table)
library(reshape2)


table1 <- read.table("base_test_fil.csv", sep=";", quote="\"",  header=TRUE)
#fread(data.table)

table3=as.data.table(table1)
#aggregation + binary matrix of table1
table2 <- dcast.data.table(table3, id_customer ~ product_id,value.var="product_id", fun.aggregate=length)
table2=as.data.frame(table2)
#verification
#class(table2)
data.stat=table2

#remark: we can do directly :data.stat=table2 
options(digits=4)

# Drop any column named "id_order":technique :not in 
data.stat.ibs <- (data.stat[,!(names(data.stat) %in% c("id_customer"))])

# Create a placeholder dataframe listing item vs. item
data.stat.ibs.similarity  <- matrix(NA, nrow=ncol(data.stat.ibs),ncol=ncol(data.stat.ibs),dimnames=list(colnames(data.stat.ibs),colnames(data.stat.ibs)))

getCosine <- function(x){
   as.matrix(t(x[,c(1:ncol(x))])%*%x[,c(1:ncol(x))]/(sqrt(colSums(x^2) %*% t(colSums(x^2)))))
}

data.stat.ibs.similarity<-getCosine(as.matrix(data.stat.ibs))

# Back to dataframe
data.stat.ibs.similarity <- as.data.frame(data.stat.ibs.similarity)
#View(data.stat.ibs.similarity)

# Get the top 10 neighbours for each
data.stat.neighbours <- matrix(NA, nrow=ncol(data.stat.ibs.similarity),ncol =11,dimnames=list(colnames(data.stat.ibs.similarity)))

for(i in 1:ncol(data.stat.ibs)) 
{
    data.stat.neighbours[i,] <- (t(head(n=11,rownames(data.stat.ibs.similarity[order(data.stat.ibs.similarity[,i],decreasing=TRUE),][i]))))
}

data=data.stat.neighbours
data=data[,-1]
colnames(data)=c("best1","best2","best3","best4","best5","best6","best7","best8","best9","best10")

#Export csv file 
write.csv(data,file="export1.csv",row.names=TRUE)
#Sys.time()-T1
#####user_based_collaborative
#Lets make a helper function to calculate the scores
getScore <- function(history, similarities)
{
  x <- sum(history*similarities)/sum(similarities)
  x
}

# A placeholder matrix
holder <- matrix(NA,nrow=nrow(data.stat),ncol=ncol(data.stat)-1,dimnames=list((data.stat$id_customer),colnames(data.stat[-1])))

# Loop through the id_customers (rows)
for(i in 1:nrow(holder)) 
{
  # Loops through the products (columns)
  for(j in 1:ncol(holder)) 
  {
    # Get the id_customer's name and th product's name
    # We do this not to conform with vectors sorted differently 
    id_customer <- rownames(holder)[i]
     product <- colnames(holder)[j]

     # We do not want to recommend products you have already consumed
     #Nous ne voulons pas de recommander des produits que vous avez déjà consommés
     # If you have already consumed it, we store an empty string
     if(as.integer(data.stat[data.stat$id_customer==id_customer,product]) == 1)
     { 
       holder[i,j]<-""
     } else {
       # We first have to get a product's top 10 neighbours sorted by similarity
       topN<-((head(n=11,(data.stat.ibs.similarity[order(data.stat.ibs.similarity[,product],decreasing=TRUE),][product]))))
       topN.names <- as.character(rownames(topN))
       topN.similarities <- as.numeric(topN[,1])

       # Drop the first one because it will always be the same product
       topN.similarities<-topN.similarities[-1]
       topN.names<-topN.names[-1]

       # We then get the id_customer's purchase history for those 10 items
       topN.purchases<- data.stat[,c("id_customer",topN.names)]
       topN.id_customerPurchases<-topN.purchases[topN.purchases$id_customer==id_customer,]
       topN.id_customerPurchases <- as.numeric(topN.id_customerPurchases[!(names(topN.id_customerPurchases) %in% c("id_customer"))])

       # We then calculate the score for that product and that id_customer
       holder[i,j]<-getScore(similarities=topN.similarities,history=topN.id_customerPurchases)

    } # close else statement
  } # end product for loop   
} # end id_customer for loop

# Output the results to a file
data.stat.id_customer.scores <- holder
#write.csv(file="final-id_customer-scores.csv",data.stat.id_customer.scores)

# Lets make our recommendations pretty
data.stat.id_customer.scores.holder <- matrix(NA,   nrow=nrow(data.stat.id_customer.scores),ncol=100,dimnames=list(rownames(data.stat.id_customer.scores)))
for(i in 1:nrow(data.stat.id_customer.scores)) 
{
  data.stat.id_customer.scores.holder[i,] <- names(head(n=100,(data.stat.id_customer.scores[,order(data.stat.id_customer.scores[i,],decreasing=TRUE)])[i,]))
}

# Write output to file
write.csv(data.stat.id_customer.scores.holder,file="export3.csv",row.names=TRUE)

please make your title more instructive of what your code is doing — morbidCode, Aug 18 at 9:14
This code doesn't run. You need to point us to base_test_fil.csv, or edit your question/code to contain its own test data set. — Curt F., Aug 18 at 9:30
@CurtF. it seems your comment is compelling others to vote to close as "broken code". It isn't obviously broken code to me, and the OP says it works for small data set but too slow for a bigger one. Closing as "broken code" is not right. To the OP: it would help greatly if you could include a sample data set, something big enough to demonstrate the slowness, so that reviewers can play with it and test suggested improvements before actually suggesting them — janos♦, Aug 18 at 10:25
@joo it would help greatly if you could include a sample data set, something big enough to demonstrate the slowness, so that reviewers can play with it and test suggested improvements before actually suggesting them — janos♦, Aug 18 at 11:51

asked	18 days ago
viewed	51 times

current community

your communities

more stack exchange communities

Optimize CSV reader

Your Answer

Browse other questions tagged performance r or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

Optimize CSV reader

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Browse other questions tagged performance r or ask your own question.

Related

Hot Network Questions