這一段程式可以讓同學們在動態網頁上觀察:
對於評論的:
的效果。
透過互動式的圖形, 我們也可以觀察這些效果是否會隨商業類別(508 categories)發生變化。
Sys.setlocale('LC_ALL','C')
[1] “C”
library(magrittr)
library(highcharter)
library(slam)
library(tm)
library(SnowballC)
library(RColorBrewer)
# set color palette
pals = c(brewer.pal(8,"Set2")[c(6)],
brewer.pal(8,"Dark2"),
brewer.pal(8,"Set1")[c(1)])
# load data
load('data/yelp1.rdata')
load('data/average.rdata')
load('data/empath.rdata')
load('data/dtm0.rdata')
To aviod glitches, we only analyze the business categories with more than 500 reviews.
First we do a quick hi-clustering on these (123) categories by their normolized theme scores.
# clusters of categories by themes scores
d = dist(scale(wx[CA$nrev>500,]))
hc = hclust(d,"ward.D2")
Cut them into 10 groups.
kg = cutree(hc,10)
kg = order(order(-table(kg)))[kg]
table(kg)
kg
1 2 3 4 5 6 7 8 9 10
44 23 17 10 8 6 5 4 3 3
Name the groups and combine the group_id with the category profiles.
gnames = c('Restaurant','Bar', 'Shop', 'Venue', 'Service',
'Art', 'Beauty', 'Fitness', 'Health', 'Public')
cats = cbind( cat=rownames(CA)[CA$nrev>500],
grp=factor(kg,labels=gnames),
CA[CA$nrev>500,] )
head(cats)
cat grp nbiz nrev avg.rev
Restaurants Restaurants Restaurant 4503 149319 33.15989
Food Food Restaurant 1616 35421 21.91894
Bars Bars Bar 517 32817 63.47582
Nightlife Nightlife Bar 640 28389 44.35781
American (New) American (New) Bar 341 22696 66.55718
Mexican Mexican Restaurant 630 18880 29.96825
Make a helper function for interactive charting …
# helper function
make.chart = function(bygroup=F) { # x,X,y,Y,mxBCm=,review
# overall usage and lift
N = as.numeric(length(X))
txt = sprintf("OVERALL: Usage=%.3f, Conf=%.3f, Base=%.3f, Lift=%.3f",
sum(X)/N, sum(X&Y)/sum(X), sum(Y)/N,
N*sum(Y&X)/sum(X)/sum(Y))
if(bygroup) {
mx = sapply(levels(cats$grp), function(g)
rowSums(mxBC[, as.character(cats$cat[cats$grp == g])]) > 0 )
bubbles = list(maxSize="20%",minSize=20)
df2 = aggregate(.~grp,cats[,2:4],sum) }
else {
mx = mxBC[,CA$nrev>500]
bubbles = list(maxSize="10%",minSize=10)
df2 = cats[,1:4] }
# cases
df = mx %>% apply(2,function(v) {
i = review$bid %in% rownames(mxBC)[v]
n = as.numeric(sum(i))
c( usage = sum(X[i])/n,
base = round(sum(Y[i])/n, 3),
conf = round(sum(X[i]&Y[i])/sum(X[i]), 3),
lift = n * sum(Y[i]&X[i]) / sum(X[i]) / sum(Y[i]) )
}) %>% t %>% data.frame %>% cbind(df2)
# bubble chart
tips=paste0(ifelse(bygroup, "", "<b>{point.cat}</b><br>"),
"no.rev/biz: {point.nrev} / {point.nbiz}<br>",
"conf/base: {point.conf} / {point.base}")
hchart(df,"scatter",hcaes(x=usage,y=lift,size=nrev,group=grp)) %>%
hc_title(text=sprintf("The effect of <b>%s</b> on <b>%s</b>",x,y)) %>%
hc_subtitle(text = txt) %>% hc_colors(pals) %>%
hc_tooltip(hideDelay=100,useHTML=T,pointFormat=tips) %>%
hc_plotOptions(bubble = bubbles) %>% hc_size(height=640) %>%
hc_chart(zoomType="xy") %>% hc_add_theme(hc_theme_flat())
}
y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "swearing_terms"; X = scores[, x] %>% {. > median(.)}
make.chart()
Usage
= \(P[X]\) : the base probability of \(X\)Conf
= \(P[Y|X]\) : the probability of \(Y\) given \(X\)Base
= \(P[Y]\) : the base probability of \(Y\)Lift
= \(\frac{P[Y|X]}{P[Y]}\) : the lift of \(X\) on \(P[Y]\) 可以一次看一群話題,也可以將整群商業類別集合起來一起看 …
y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "sexual+lust"
X = rowSums(sapply(c('sexual','lust'), # <-- 將話題放在括弧裡面
function(s) scores[,s] > median(scores[,s]))) > 0
make.chart(bygroup=TRUE)
y = "cool"; Y = review[, y] %>% {. > median(.)}
# 將字根放在括弧裡面
terms = grep("^(authen|genuin|pure|innoc|origin|true|truth)",
dtm$dimnames$Terms[1:10000], value=F)
x = "authenticity"; X = as.integer(row_sums(dtm[,terms]) > 0)
make.chart()
To analyze the effect of words, we need a new document term matrix (dtm2
).
corp = Corpus(VectorSource(review$text))
corp = tm_map(corp, content_transformer(tolower))
corp = tm_map(corp, removePunctuation)
dtm2 = DocumentTermMatrix(corp); dtm2
<<DocumentTermMatrix (documents: 215879, terms: 215488)>>
Non-/sparse entries: 15741015/46503592937
Sparsity : 100%
Maximal term length: 932
Weighting : term frequency (tf)
dtm2 = removeSparseTerms(dtm2, .999); dtm2
<<DocumentTermMatrix (documents: 215879, terms: 5230)>>
Non-/sparse entries: 14361122/1114686048
Sparsity : 99%
Maximal term length: 15
Weighting : term frequency (tf)
y = "useful"; Y = review[, y] %>% {. > median(.)}
w = which(colnames(dtm2) %in% c('but','however','nonetheless'))
x = "but..."; X = as.integer(row_sums(dtm2[, w]) > 0)
make.chart(bygroup=TRUE)
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "?"; X = grepl("?",review$text,fixed=T)
make.chart(bygroup=TRUE)
More than one Punctuation …
x = "? ..."; X = grepl("\\?|!",review$text)
make.chart()
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "anger"; X = senti[, x] %>% {. > median(.)}
make.chart()
x = "positive"; X = senti[, x] %>% {. > median(.)}
make.chart()
x = "negative"; X = senti[, x] %>% {. > median(.)}
make.chart()