內容話題與字詞對評論互動的效果

這一段程式可以讓同學們在動態網頁上觀察：

某一種(群)內容話題 (194 themes)，
某一個(群)字詞，或
某一個(群)字根
某一種情緒

對於評論的：

useful
funny
cool

的效果。

透過互動式的圖形，我們也可以觀察這些效果是否會隨商業類別(508 categories)發生變化。

Sys.setlocale('LC_ALL','C')

[1] “C”

library(magrittr)
library(highcharter)
library(slam)
library(tm)
library(SnowballC)
library(RColorBrewer)          
# set color palette
pals = c(brewer.pal(8,"Set2")[c(6)],
         brewer.pal(8,"Dark2"),
         brewer.pal(8,"Set1")[c(1)])
# load data
load('data/yelp1.rdata')
load('data/average.rdata')
load('data/empath.rdata')
load('data/dtm0.rdata')

(1) Preparation

To aviod glitches, we only analyze the business categories with more than 500 reviews.
First we do a quick hi-clustering on these (123) categories by their normolized theme scores.

# clusters of categories by themes scores
d = dist(scale(wx[CA$nrev>500,]))
hc = hclust(d,"ward.D2")

Cut them into 10 groups.

kg = cutree(hc,10)
kg = order(order(-table(kg)))[kg]
table(kg)

kg
 1  2  3  4  5  6  7  8  9 10 
44 23 17 10  8  6  5  4  3  3

Name the groups and combine the group_id with the category profiles.

gnames = c('Restaurant','Bar', 'Shop', 'Venue', 'Service', 
           'Art', 'Beauty', 'Fitness', 'Health', 'Public')
cats = cbind( cat=rownames(CA)[CA$nrev>500], 
              grp=factor(kg,labels=gnames), 
              CA[CA$nrev>500,] )
head(cats)

                          cat        grp nbiz   nrev  avg.rev
Restaurants       Restaurants Restaurant 4503 149319 33.15989
Food                     Food Restaurant 1616  35421 21.91894
Bars                     Bars        Bar  517  32817 63.47582
Nightlife           Nightlife        Bar  640  28389 44.35781
American (New) American (New)        Bar  341  22696 66.55718
Mexican               Mexican Restaurant  630  18880 29.96825

(2) Helper Function

Make a helper function for interactive charting …

# helper function
make.chart = function(bygroup=F) { # x,X,y,Y,mxBCm=,review
  # overall usage and lift
  N = as.numeric(length(X))
  txt = sprintf("OVERALL: Usage=%.3f, Conf=%.3f, Base=%.3f, Lift=%.3f", 
                sum(X)/N, sum(X&Y)/sum(X), sum(Y)/N,
                N*sum(Y&X)/sum(X)/sum(Y))

  if(bygroup) {
    mx = sapply(levels(cats$grp), function(g) 
      rowSums(mxBC[, as.character(cats$cat[cats$grp == g])]) > 0 )
    bubbles = list(maxSize="20%",minSize=20)
    df2 = aggregate(.~grp,cats[,2:4],sum) }
  else {
    mx = mxBC[,CA$nrev>500]
    bubbles = list(maxSize="10%",minSize=10)
    df2 = cats[,1:4] }

  # cases
  df = mx %>% apply(2,function(v) {
      i = review$bid %in% rownames(mxBC)[v]
      n = as.numeric(sum(i))
      c( usage = sum(X[i])/n,
         base = round(sum(Y[i])/n, 3),
         conf = round(sum(X[i]&Y[i])/sum(X[i]), 3),
         lift = n * sum(Y[i]&X[i]) / sum(X[i]) / sum(Y[i]) )
    }) %>% t %>% data.frame %>% cbind(df2)
  
  # bubble chart
  tips=paste0(ifelse(bygroup, "", "<b>{point.cat}</b><br>"),
              "no.rev/biz: {point.nrev} / {point.nbiz}<br>",
              "conf/base: {point.conf} / {point.base}")
  hchart(df,"scatter",hcaes(x=usage,y=lift,size=nrev,group=grp)) %>%
    hc_title(text=sprintf("The effect of <b>%s</b> on <b>%s</b>",x,y)) %>% 
    hc_subtitle(text = txt) %>% hc_colors(pals) %>% 
    hc_tooltip(hideDelay=100,useHTML=T,pointFormat=tips) %>% 
    hc_plotOptions(bubble = bubbles) %>% hc_size(height=640) %>% 
    hc_chart(zoomType="xy") %>% hc_add_theme(hc_theme_flat())
}

(3) Explore the Effect of Content Themes

3.1 某一種內容話題

y = "funny";  Y = review[, y] %>% {. > median(.)}
x = "swearing_terms"; X = scores[, x] %>% {. > median(.)}
make.chart()

For the entire corpus and each business categories, we calculate and display:

Usage = \(P[X]\) : the base probability of \(X\)
Conf = \(P[Y|X]\) : the probability of \(Y\) given \(X\)
Base = \(P[Y]\) : the base probability of \(Y\)
Lift = \(\frac{P[Y|X]}{P[Y]}\) : the lift of \(X\) on \(P[Y]\)

3.2 某一群內容話題

可以一次看一群話題，也可以將整群商業類別集合起來一起看 …

y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "sexual+lust"
X = rowSums(sapply(c('sexual','lust'), # <-- 將話題放在括弧裡面
                   function(s) scores[,s] > median(scores[,s]))) > 0 
make.chart(bygroup=TRUE)

3.3 某一群(個)字根

y = "cool"; Y = review[, y] %>% {. > median(.)}
# 將字根放在括弧裡面
terms = grep("^(authen|genuin|pure|innoc|origin|true|truth)",
             dtm$dimnames$Terms[1:10000], value=F)
x = "authenticity"; X = as.integer(row_sums(dtm[,terms]) > 0)
make.chart()

3.4 某一群(個)字

To analyze the effect of words, we need a new document term matrix (dtm2).

corp = Corpus(VectorSource(review$text))
corp = tm_map(corp,  content_transformer(tolower))
corp = tm_map(corp, removePunctuation)
dtm2 = DocumentTermMatrix(corp); dtm2

<<DocumentTermMatrix (documents: 215879, terms: 215488)>>
Non-/sparse entries: 15741015/46503592937
Sparsity           : 100%
Maximal term length: 932
Weighting          : term frequency (tf)

dtm2 = removeSparseTerms(dtm2, .999); dtm2

<<DocumentTermMatrix (documents: 215879, terms: 5230)>>
Non-/sparse entries: 14361122/1114686048
Sparsity           : 99%
Maximal term length: 15
Weighting          : term frequency (tf)

y = "useful"; Y = review[, y] %>% {. > median(.)}
w = which(colnames(dtm2) %in% c('but','however','nonetheless'))
x = "but..."; X = as.integer(row_sums(dtm2[, w]) > 0)
make.chart(bygroup=TRUE)

3.5 標點符號

y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "?"; X = grepl("?",review$text,fixed=T)
make.chart(bygroup=TRUE)

More than one Punctuation …

x = "? ..."; X = grepl("\\?|!",review$text)
make.chart()

3.6 情緒

y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "anger"; X = senti[, x] %>% {. > median(.)}
make.chart()

x = "positive"; X = senti[, x] %>% {. > median(.)}
make.chart()

x = "negative"; X = senti[, x] %>% {. > median(.)}
make.chart()