내용 |
# Start from an empty workspace and resolve all relative paths against the
# project directory.
# NOTE(review): rm(list = ls()) and a hard-coded setwd() tie the script to one
# machine; they are kept because every relative path below relies on them.
rm(list = ls())
setwd("D:/R_Project/Lotte/Workplace")

# Load the comma-separated input tables (each has a header row).
# NOTE(review): R documents "unknown", "latin1" and "UTF-8" as the values of
# `encoding`; confirm that the lowercase "utf-8" used here has the intended
# effect on this platform.
customer <- read.csv(
  file = "Data/01.고객DEMO.txt", header = TRUE, sep = ",", encoding = "utf-8"
)
# purchase_origin <- read.csv("Data/02.구매상품TR.txt", header=TRUE, sep=",", encoding="utf-8")
product <- read.csv(
  file = "Data/03.상품분류.txt", header = TRUE, sep = ",", encoding = "utf-8"
)
competion <- read.csv(
  file = "Data/04.경쟁사이용.txt", header = TRUE, sep = ",", encoding = "utf-8"
)
membership <- read.csv(
  file = "Data/05.멤버십여부.txt", header = TRUE, sep = ",", encoding = "utf-8"
)
channel <- read.csv(
  file = "Data/06.채널이용.txt", header = TRUE, sep = ",", encoding = "utf-8"
)
#purchase <- purchase_origin[1:1000000,]
#write.csv(purchase, "Working/purchase.csv", row.names=FALSE)
#table(purchase_origin$제휴사)
#A B C D
#5770318 13338074 9379236 105402
#purchase_A <- purchase_origin[purchase_origin$제휴사=="A", ]
#purchase_B <- purchase_origin[purchase_origin$제휴사=="B", ]
#purchase_C <- purchase_origin[purchase_origin$제휴사=="C", ]
#purchase_D <- purchase_origin[purchase_origin$제휴사=="D", ]
#write.csv(purchase_A, "Working/purchase_A.csv", row.names=FALSE)
#write.csv(purchase_B, "Working/purchase_B.csv", row.names=FALSE)
#write.csv(purchase_C, "Working/purchase_C.csv", row.names=FALSE)
#write.csv(purchase_D, "Working/purchase_D.csv", row.names=FALSE)
################################
# Join: map each sub-category product code to its product name.
# table(purchase$제휴사)  # codes and product names differ between affiliates
# Affiliate D's data is used for the test run.
purchase_D <- read.csv(file = "Working/purchase_D.csv", header = TRUE)

# Inner join on affiliate + sub-category code; `product` is the product
# classification table loaded earlier in this script.
purchase_D_p <- merge(
  x = purchase_D,
  y = product,
  by = c("제휴사", "소분류코드")
)

# Persist the joined table so later sections can start from the file.
write.csv(x = purchase_D_p, file = "Working/purchase_D_p.csv", row.names = FALSE)
################################
# First recommendation: reshape the purchases into a customer x product
# count table and take each customer's most frequently bought product.
library(reshape) # provides cast(), the inverse of melt()

purchase_D_p <- read.csv(file = "Working/purchase_D_p.csv", header = TRUE)

# One row per purchase line; `value` is the count that cast() sums up.
casting_data <- purchase_D_p[, c("고객번호", "소분류명")]
casting_data$value <- 1
nrow(purchase_D_p) # 105402

# memory.size(TRUE)
# memory.limit(size=16000) # raise the memory limit to 16G

# Time the reshape: rows are customers, columns are product names.
start <- Sys.time()
casted_data <- cast(
  data = casting_data,
  formula = 고객번호 ~ 소분류명,
  fun.aggregate = sum
)
end <- Sys.time()
end - start # 4.673823 secs, 6.507427 secs
ncol(casted_data) # 146 columns

# <- Sys.time()
# casted_data <- cast(casting_data[1:200000,],
#                     고객번호 ~ 소분류명,
#                     fun=sum)
# end <- Sys.time()
# end-start # 1.46732 mins
# ncol(casted_data) # 503 columns

# Drop the first column (customer id) and name the column holding each row's
# maximum. max.col() breaks ties at random by default.
recommend_product <- names(casted_data[, -1])[max.col(casted_data[, -1])]
head(recommend_product)

# Two-column matrix: customer id and first recommended product.
customer_recommend_product <- cbind(
  "고객번호" = casted_data$고객번호,
  "추천상품1" = recommend_product
)
colnames(customer_recommend_product)
write.csv(
  x = customer_recommend_product,
  file = "Working/D_product1.csv",
  row.names = FALSE
)
################################
# Second recommendation: taken from association rules mined on the receipts.
# install.packages("arules")
library(arules)

# Look up, for each product, the right-hand side of the highest-ranked rule
# whose left-hand side contains that product.
#   products:     vector of item labels (one entry per customer)
#   sorted_rules: arules rules object, already ordered best-first
# Returns a character vector as long as `products`; an entry is "" when the
# product is NA, is not an item known to the rules, or matches no rule.
recommend_from_rules <- function(products, sorted_rules) {
  products <- as.character(products)
  rule_lhs <- lhs(sorted_rules)
  # labels() renders each itemset as "{item}"; strip the braces.
  rule_rhs <- gsub("[{}]", "", labels(rhs(sorted_rules)))
  # Only look up products that occur as items: matching an itemMatrix against
  # an unknown label is expected to raise an error rather than return FALSE.
  is_known <- !is.na(match(products, itemLabels(rule_lhs)))
  known <- unique(products[is_known])
  # One lookup per distinct product instead of one per customer.
  best <- vapply(known, function(item) {
    hits <- which(rule_lhs %in% item)
    if (length(hits) == 0L) "" else rule_rhs[hits[1L]]
  }, character(1), USE.NAMES = FALSE)
  out <- best[match(products, known)]
  out[is.na(out)] <- ""
  out
}

purchase_D_p <- read.csv("Working/purchase_D_p.csv", header = TRUE)
single_D <- subset(purchase_D_p, select = c("영수증번호", "소분류명"))
# Sort by receipt number (descending), then by product name (ascending).
single_D <- single_D[order(-single_D$영수증번호, single_D$소분류명), ]
# temp$영수증번호 <- as.factor(single_D$영수증번호)
# transaction <- as(single_D, "transactions") # this way the receipt number is also ...
# Save without row names and column names so the file can be read back in
# "single" format (column 1 = transaction id, column 2 = item).
write.table(single_D, file = "Working/purchase_D_p_single.csv",
            sep = ",", row.names = FALSE, col.names = FALSE)
rm(single_D)
transaction <- read.transactions("Working/purchase_D_p_single.csv", sep = ",",
                                 format = "single", cols = c(1, 2))
rules <- apriori(transaction,
                 parameter = list(support = 0.0001, confidence = 0.01))
rules # set of 10551 rules
# rules.frame <- as(rules, "data.frame");
# head(rules.frame)
# inspect(rules[1:10])
# inspect(sort(rules, by="lift")[1:10]) # top 10 rules, largest lift first

# For the item found in the lhs, output the rhs with the largest lift.
sorted_rule_by_lift <- sort(rules, by = "lift")
# inspect(subset(sorted_rule_by_lift, subset = lhs %in% "핸드워시/손세정제")[1])
# temp <- inspect(subset(sorted_rule_by_lift, subset = lhs %in% "핸드워시/손세정제")[1])
# temp$rhs
# as.character(temp$rhs)
# gsub("[{}]", "", as.character(temp$rhs))

D_recommended_product <- read.csv("Working/D_product1.csv", header = TRUE)
# Some products (e.g. the one in row 408) have no associated rule; those
# customers get an empty second recommendation.
D_recommended_product$추천상품2 <- recommend_from_rules(
  D_recommended_product$추천상품1,
  sorted_rule_by_lift
)
write.csv(D_recommended_product, "Working/D_product12.csv", row.names = FALSE)
############################
# Association rule visualisation
# install.packages("arulesViz")
library(arulesViz)

# Default plot of every mined rule.
plot(rules)

# The remaining plots only use the rules whose lift exceeds 20.
subrules <- subset(rules, subset = lift > 20)
plot(subrules, method = "graph", control = list(type = "items"))
plot(subrules, method = "paracoord", control = list(reorder = TRUE))
plot(subrules, method = "matrix3D", measure = "lift")
plot(subrules, method = "matrix", measure = c("lift", "confidence"))
plot(subrules, method = "grouped")
################################
# Third recommendation: products recommended from customer characteristics.
library(reshape) # provides cast(), the inverse of melt()

purchase_D_p <- read.csv(file = "Working/purchase_D_p.csv", header = TRUE)

# Keep affiliate, customer id and product name, then attach the customer
# attributes; with no `by`, merge() joins on the columns both tables share.
purchase_D_p_subSet <- purchase_D_p[, c("제휴사", "고객번호", "소분류명")]
purchase_D_p_subSet <- merge(x = purchase_D_p_subSet, y = customer)
# Convert age-band labels to the integer formed by their first two characters.
#   row: a label or a vector of labels (character or factor)
#        NOTE(review): presumably values such as "20..." / "60..." - confirm
#        against the customer table.
# Returns an integer vector the same length as `row`. Blank and NA labels map
# to 0; a non-numeric prefix yields NA (with as.integer()'s usual warning).
age_to_gen <- function(row) {
  prefix <- substr(as.character(row), 1, 2)
  # Treat missing and empty labels alike so callers never hit an NA condition.
  prefix[is.na(prefix) | prefix == ""] <- "0"
  as.integer(prefix)
}
# Replace each age-band label with its numeric prefix (0 when blank).
# vapply() pins the result type instead of letting sapply() guess it.
purchase_D_p_subSet$연령대 <- vapply(
  purchase_D_p_subSet$연령대, age_to_gen, numeric(1), USE.NAMES = FALSE
)

# The customer id is not a grouping variable; every remaining row counts as
# one purchase.
purchase_D_p_subSet <- subset(purchase_D_p_subSet, select = -고객번호)
purchase_D_p_subSet$value <- 1

# Count purchases per (affiliate, gender, age band, region) x product.
start <- Sys.time()
casted_data_2 <- cast(purchase_D_p_subSet,
                      제휴사 + 성별 + 연령대 + 거주지역 ~ 소분류명,
                      fun.aggregate = sum)
end <- Sys.time()
end - start #
ncol(casted_data_2) # 149 columns
nrow(casted_data_2)

# Product-count columns only. The top product must be chosen from these:
# running max.col() on the full table would let the four id columns take part
# in the comparison and index into the wrong set of names.
casted_data_2_sub <- subset(casted_data_2,
                            select = -c(제휴사, 성별, 연령대, 거주지역))
# max.col() breaks ties at random by default.
recommend_product3_list <-
  names(casted_data_2_sub)[max.col(casted_data_2_sub)]

# Group keys plus the most purchased product of each group.
recommend_product3 <- cbind(
  subset(casted_data_2, select = c(제휴사, 성별, 연령대, 거주지역)),
  recommend_product3_list
)
names(recommend_product3)[5] <- "추천상품3"
names(recommend_product3)

# Reload recommendations 1 and 2 and attach the customer attributes
# (merge() joins on the columns both tables share).
D_recommended_product <- read.csv("Working/D_product12.csv", header = TRUE)
D_recommended_product <- merge(D_recommended_product, customer)
# Convert age-band labels to the integer formed by their first two characters.
# (This script defines age_to_gen twice; this is the second definition.)
#   row: a label or a vector of labels (character or factor)
#        NOTE(review): presumably values such as "20..." / "60..." - confirm
#        against the customer table.
# Returns an integer vector the same length as `row`. Blank and NA labels map
# to 0; a non-numeric prefix yields NA (with as.integer()'s usual warning).
age_to_gen <- function(row) {
  prefix <- substr(as.character(row), 1, 2)
  # Treat missing and empty labels alike so callers never hit an NA condition.
  prefix[is.na(prefix) | prefix == ""] <- "0"
  as.integer(prefix)
}
# Convert the customers' age-band labels the same way as on the purchase side
# so both tables share the numeric age key. vapply() pins the result type.
D_recommended_product$연령대 <- vapply(
  D_recommended_product$연령대, age_to_gen, numeric(1), USE.NAMES = FALSE
)

# Attach each demographic group's recommended product to its customers.
# `all = TRUE` (spelled out; `T` can be reassigned) keeps unmatched rows from
# both sides, as before.
tt <- merge(D_recommended_product, recommend_product3,
            by = c("성별", "연령대", "거주지역"), all = TRUE)
D_recommended_product <- subset(
  tt,
  select = c("고객번호", "추천상품1", "추천상품2", "추천상품3")
)
rm(tt)

# Order by customer id and save all three recommendations.
D_recommended_product <-
  D_recommended_product[order(D_recommended_product$고객번호), ]
write.csv(D_recommended_product, "Working/D_product123.csv", row.names = FALSE)
################################
# Data exploration

# Usage count summed per value of channel$제휴사.
(with(channel, tapply(이용횟수, 제휴사, sum)))
barplot(with(channel, tapply(이용횟수, 제휴사, sum)))
# A_MOBILE/APP B_MOBILE/APP B_ONLINEMALL C_MOBILE/APP C_ONLINEMALL D_MOBILE/APP
# 13256 93067 8978 56 3770 609

# Number of members per membership.
table(membership$멤버십명)
barplot(table(membership$멤버십명))
# 다둥이 더영 롭스 하이마트
# 2009 642 387 4418

# Competitor usage counts per affiliate.
table(competion$제휴사, competion$경쟁사)
#   A01 A02 B01 B02 C01 C02 C03 D01 D02
# A 3262 3123 0 0 0 0 0 0 0
# B 0 0 7682 5844 0 0 0 0 0
# C 0 0 0 0 1013 852 3311 0 0
# D 0 0 0 0 0 0 0 483 2589
|