내용

글번호 251
작성자 heojk
작성일 2016-12-29 16:02:00
제목 Lotte 상품 추천
내용
# Lotte product recommendation script, part 1:
# load the input tables, join affiliate D's purchases with the product
# catalogue, and build the customer x product matrix used for the first
# recommendation.
#
# NOTE(review): rm(list = ls()) and setwd() change the caller's session state.
# They are kept because every path below is relative to this directory;
# consider passing a base directory and using file.path() instead.
rm(list = ls())
setwd("D:/R_Project/Lotte/Workplace")

customer <- read.csv("Data/01.고객DEMO.txt", header = TRUE, sep = ",",
                     encoding = "utf-8")
#purchase_origin <- read.csv("Data/02.구매상품TR.txt", header=TRUE, sep=",", encoding="utf-8")
product <- read.csv("Data/03.상품분류.txt", header = TRUE, sep = ",",
                    encoding = "utf-8")
competion <- read.csv("Data/04.경쟁사이용.txt", header = TRUE, sep = ",",
                      encoding = "utf-8")
membership <- read.csv("Data/05.멤버십여부.txt", header = TRUE, sep = ",",
                       encoding = "utf-8")
channel <- read.csv("Data/06.채널이용.txt", header = TRUE, sep = ",",
                    encoding = "utf-8")

# One-off preparation kept for reference: sample the purchase log and split
# it per affiliate into Working/purchase_<affiliate>.csv.
#purchase <- purchase_origin[1:1000000,]
#write.csv(purchase, "Working/purchase.csv", row.names=FALSE)

#table(purchase_origin$제휴사)
#      A        B       C      D
#5770318 13338074 9379236 105402

#purchase_A <- purchase_origin[purchase_origin$제휴사=="A", ]
#purchase_B <- purchase_origin[purchase_origin$제휴사=="B", ]
#purchase_C <- purchase_origin[purchase_origin$제휴사=="C", ]
#purchase_D <- purchase_origin[purchase_origin$제휴사=="D", ]

#write.csv(purchase_A, "Working/purchase_A.csv", row.names=FALSE)
#write.csv(purchase_B, "Working/purchase_B.csv", row.names=FALSE)
#write.csv(purchase_C, "Working/purchase_C.csv", row.names=FALSE)
#write.csv(purchase_D, "Working/purchase_D.csv", row.names=FALSE)

################################
# Join: map sub-category product codes to product names.
#table(purchase$제휴사)
# Codes and product names differ per affiliate, so the join key includes the
# affiliate. Affiliate D's data is used for testing.
purchase_D <- read.csv("Working/purchase_D.csv", header = TRUE)
purchase_D_p <- merge(purchase_D, product, by = c("제휴사", "소분류코드"))
write.csv(purchase_D_p, "Working/purchase_D_p.csv", row.names = FALSE)

################################
# First recommended product, obtained by reshaping the data.
library(reshape)  # provides cast(), the inverse of melt()

purchase_D_p <- read.csv("Working/purchase_D_p.csv", header = TRUE)
casting_data <- subset(purchase_D_p, select = c("고객번호", "소분류명"))
casting_data$value <- 1  # one unit per purchase row; summed by cast() below
nrow(purchase_D_p)  # 105402

#memory.size(TRUE)
#memory.limit(size=16000)  # raise the memory limit to 16 GB

# Build a customer x product-name table of purchase counts and time it.
# `fun.aggregate` is spelled out instead of relying on partial matching.
start <- Sys.time()
casted_data <- cast(casting_data, 고객번호 ~ 소분류명, fun.aggregate = sum)
end <- Sys.time()
end - start  # 4.673823 secs, 6.507427 secs
ncol(casted_data)  # 146 columns

#start <- Sys.time()
#casted_data <- cast(casting_data[1:200000,],
#                    고객번호 ~ 소분류명,
#                    fun=sum)
#end <- Sys.time()
#end-start  # 1.46732 mins
#ncol(casted_data)  # 503 columns

# Recommendation 1: for every customer, the product column with the largest
# purchase count (first column is the customer id, so it is dropped).
# NOTE(review): max.col() breaks ties at random by default, so results can
# differ between runs; pass ties.method = "first" if reproducibility matters.
recommend_product <- names(casted_data[, -1])[max.col(casted_data[, -1])]
head(recommend_product)

customer_recommend_product <- cbind(casted_data$고객번호, recommend_product)
colnames(customer_recommend_product) <- c("고객번호", "추천상품1")
colnames(customer_recommend_product)
write.csv(customer_recommend_product, "Working/D_product1.csv",
          row.names = FALSE)

################################
# Recommendation 2: taken from association-rule analysis.
#install.packages("arules")
library(arules)

purchase_D_p <- read.csv("Working/purchase_D_p.csv", header = TRUE)
single_D <- subset(purchase_D_p, select = c("영수증번호", "소분류명"))
# Sort by receipt number descending, then product name ascending.
single_D <- single_D[order(-single_D$영수증번호, single_D$소분류명), ]

#temp$영수증번호 <- as.factor(single_D$영수증번호)
#transaction <- as(single_D, "transactions")  # this way the receipt number is also ...

# Save without row names and column names, in "single" format
# (one receipt-id / item pair per line) for read.transactions().
write.table(single_D, file = "Working/purchase_D_p_single.csv", sep = ",",
            row.names = FALSE, col.names = FALSE)
rm(single_D)

transaction <- read.transactions("Working/purchase_D_p_single.csv",
                                 sep = ",", format = "single",
                                 cols = c(1, 2))
rules <- apriori(transaction,
                 parameter = list(support = 0.0001, confidence = 0.01))
rules  # set of 10551 rules

#rules.frame <- as(rules, "data.frame");
#head(rules.frame)
#inspect(rules[1:10])
#inspect(sort(rules, by="lift")[1:10])  # top 10 rules, highest lift first

# For an item found in the lhs, pick the rhs of the rule with the highest lift.
sorted_rule_by_lift <- sort(rules, by = "lift")

#inspect(subset(sorted_rule_by_lift, subset = lhs %in% "핸드워시/손세정제")[1])
#temp <- inspect(subset(sorted_rule_by_lift, subset = lhs %in% "핸드워시/손세정제")[1])
#temp$rhs
#as.character(temp$rhs)
#gsub("[{}]", "", as.character(temp$rhs))

D_recommended_product <- read.csv("Working/D_product1.csv", header = TRUE)
D_recommended_product$추천상품2 <- ""

# The product in row 408 has no associated product, so a missing match must be
# handled rather than allowed to abort the run.
#as.character(D_recommended_product$추천상품[408])
#temp_rule <- subset(sorted_rule_by_lift, subset = lhs %in% as.character(D_recommended_product$추천상품1[408]))
#temp <- inspect(subset(sorted_rule_by_lift, subset = lhs %in% as.character(D_recommended_product$추천상품1[407]))[1])

# Look each distinct first-choice product up once (instead of once per
# customer row) and read the rhs label directly from the rule object rather
# than scraping the printed output of inspect().
first_products <- as.character(D_recommended_product$추천상품1)
unique_products <- unique(first_products)
second_by_product <- character(length(unique_products))
for (k in seq_along(unique_products)) {
  second_by_product[k] <- tryCatch({
    matched <- subset(sorted_rule_by_lift,
                      subset = lhs %in% unique_products[k])
    if (length(matched) > 0) {
      # Rules are already sorted by lift, so the first match is the best one.
      gsub("[{}]", "", labels(rhs(matched[1])))
    } else {
      ""
    }
  }, error = function(e) {
    # Best effort: a product that cannot be looked up (e.g. an item label
    # absent from the rules) simply gets no second recommendation.
    ""
  })
}
D_recommended_product$추천상품2 <-
  second_by_product[match(first_products, unique_products)]

write.csv(D_recommended_product, "Working/D_product12.csv",
          row.names = FALSE)

############################
# Visualise the association rules.
#install.packages("arulesViz")
library(arulesViz)

plot(rules)
subrules <- subset(rules, subset = lift > 20)
plot(subrules, method = "graph", control = list(type = "items"))
plot(subrules, method = "paracoord", control = list(reorder = TRUE))
plot(subrules, method = "matrix3D", measure = "lift")
plot(subrules, method = "matrix", measure = c("lift", "confidence"))
plot(subrules, method = "grouped")

################################
# Recommendation 3: product recommendation by customer characteristics.
library(reshape)  # provides cast(), the inverse of melt()

purchase_D_p <- read.csv("Working/purchase_D_p.csv", header = TRUE)
purchase_D_p_subSet <- subset(purchase_D_p,
                              select = c("제휴사", "고객번호", "소분류명"))
purchase_D_p_subSet <- merge(purchase_D_p_subSet, customer)

# Convert age-group labels to the integer formed by their first two
# characters.
#
# row: vector (character or factor) of age-group labels.
# Returns an integer vector of the same length; blank or missing labels
# map to 0.
age_to_gen <- function(row) {
  prefix <- substr(as.character(row), 1, 2)
  blank <- is.na(prefix) | prefix == ""
  out <- integer(length(prefix))
  out[!blank] <- as.integer(prefix[!blank])
  out
}

purchase_D_p_subSet$연령대 <- age_to_gen(purchase_D_p_subSet$연령대)
purchase_D_p_subSet <- subset(purchase_D_p_subSet, select = -고객번호)
purchase_D_p_subSet$value <- 1  # one unit per purchase row; summed by cast()

# Purchase counts per (affiliate, gender, age group, region) x product name.
start <- Sys.time()
casted_data_2 <- cast(purchase_D_p_subSet,
                      제휴사 + 성별 + 연령대 + 거주지역 ~ 소분류명,
                      fun.aggregate = sum)
end <- Sys.time()
end - start
ncol(casted_data_2)  # 149 columns
nrow(casted_data_2)

# Drop the four key columns so that only product-count columns compete in
# max.col(); including them would let a key column (e.g. the age group) be
# picked as the "most purchased product".
casted_data_2_sub <- subset(casted_data_2,
                            select = -c(제휴사, 성별, 연령대, 거주지역))
recommend_product3_list <-
  names(casted_data_2_sub)[max.col(casted_data_2_sub)]
# Attach the per-segment best product to its segment key columns.
recommend_product3 <- cbind(
  subset(casted_data_2, select = c(제휴사, 성별, 연령대, 거주지역)),
  recommend_product3_list
)
names(recommend_product3)[5] <- "추천상품3"
names(recommend_product3)

D_recommended_product <- read.csv("Working/D_product12.csv", header = TRUE)
D_recommended_product <- merge(D_recommended_product, customer)

# Convert age-group labels to the integer formed by their first two
# characters.
#
# row: vector (character or factor) of age-group labels.
# Returns an integer vector of the same length; blank or missing labels
# map to 0.
age_to_gen <- function(row) {
  prefix <- substr(as.character(row), 1, 2)
  blank <- is.na(prefix) | prefix == ""
  out <- integer(length(prefix))
  out[!blank] <- as.integer(prefix[!blank])
  out
}

D_recommended_product$연령대 <- age_to_gen(D_recommended_product$연령대)

# Join each customer to the recommendation of their
# (gender, age group, region) segment.
tt <- merge(D_recommended_product, recommend_product3,
            by = c("성별", "연령대", "거주지역"), all = TRUE)
D_recommended_product <- subset(
  tt,
  select = c("고객번호", "추천상품1", "추천상품2", "추천상품3")
)
rm(tt)

D_recommended_product <-
  D_recommended_product[order(D_recommended_product$고객번호), ]
write.csv(D_recommended_product, "Working/D_product123.csv",
          row.names = FALSE)

################################
# Data exploration

# Usage count per affiliate/channel.
(tapply(channel$이용횟수, channel$제휴사, sum))
barplot(tapply(channel$이용횟수, channel$제휴사, sum))
#A_MOBILE/APP B_MOBILE/APP B_ONLINEMALL C_MOBILE/APP C_ONLINEMALL D_MOBILE/APP
#       13256        93067         8978           56         3770          609

# Number of members per membership programme.
table(membership$멤버십명)
barplot(table(membership$멤버십명))
#다둥이   더영   롭스 하이마트
#  2009    642    387     4418

# Competitor usage count per affiliate.
table(competion$제휴사, competion$경쟁사)
#   A01  A02  B01  B02  C01  C02  C03  D01  D02
#A 3262 3123    0    0    0    0    0    0    0
#B    0    0 7682 5844    0    0    0    0    0
#C    0    0    0    0 1013  852 3311    0    0
#D    0    0    0    0    0    0    0  483 2589