Inaugural Address Analysis:Part3 Using shiny and networkD3

TF -IDF

The term frequency amd inverse document frequency factors, together give us a summary of the importance of every word. In this post, we will be using the shiny
and the networkD3 packages for our project.

server.R

library(tidyverse)
library(tidytext)
library(widyr)

library(stringr)
library(shiny)
library(networkD3)

inaug <- read.csv("inaug_speeches.csv",header=T,stringsAsFactors = F)
important_words <- inaug %>%
  unnest_tokens(word,text) %>%
  filter(str_detect(word,"^[a-z]+$")) %>%
  group_by(Name,word)%>%
  summarise(n=n()) %>%
  bind_tf_idf(word,Name,n) %>%
  group_by(Name) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10)

plotTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 10,colour = "black",hjust=0.5),
    plot.subtitle = element_text(face="italic"),
    plot.caption = element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey80", color = "white"),
    strip.text = element_text(size=8),
    axis.title = element_text(size=5),
    axis.text = element_text(size=5),
    axis.title.x = element_text(hjust=1),
    axis.title.y = element_text(hjust=1),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(colour = "black", face = "bold"),
    legend.text = element_text(colour = "black", face = "bold"),
    axis.text.x = element_text(vjust=-1,angle=90,size=10))
}

persons <- inaug %>% select(Name) %>% unique()


plot_network <- function(data,person){

  words_to_check <- important_words %>% filter(Name==person) %>% select(word)
  data_new_cor <- data %>%
    unnest_tokens(word,text) %>%
    filter(!word %in% stop_words$word)%>%
    pairwise_cor(word,X) %>%
    filter(item1 %in% words_to_check$word) %>%
    na.omit() %>%
    filter(correlation>0.8)

  item1 <- data_new_cor$item1
  item2 <- data_new_cor$item2
  n <- data_new_cor$correlation

  nodeFactors <- factor(sort(unique(c(item1, item2))))
  nodes <- data.frame(name = nodeFactors, group = 1)

  item1 <- match(item1, levels(nodeFactors)) - 1
  item2 <- match(item2, levels(nodeFactors)) - 1
  links <- data.frame(item1, item2, n)

  forceNetwork(Links = links, Nodes = nodes, Source = 'item1', 
               Target = 'item2', Value = 'n', NodeID = 'name', Group = 'group',fontSize = 25,
               colourScale = JS('force.alpha(1); force.restart(); d3.scaleOrdinal(d3.schemeCategory20);'),
               opacity = 0.85, zoom = TRUE, opacityNoHover = 0.1,
               linkWidth = networkD3::JS("function(d) { return d.correlation/5; }"))

}



compute_data <- function(updateProgress = NULL) {

  dat <- data.frame(x = numeric(0), y = numeric(0))

  for (i in 1:10) {
    Sys.sleep(0.25)


    new_row <- data.frame(x = rnorm(1), y = rnorm(1))


    if (is.function(updateProgress)) {
      text <- paste0("x:", round(new_row$x, 2), " y:", round(new_row$y, 2))
      updateProgress(detail = text)
    }


    dat <- rbind(dat, new_row)
  }

  dat
}


shinyServer(function(input, output) {



  output$force <- renderForceNetwork({



    progress <- shiny::Progress$new()
    progress$set(message = "Crunching the numbers...", value = 0)

    on.exit(progress$close())


    updateProgress <- function(value = NULL, detail = NULL) {
      if (is.null(value)) {
        value <- progress$getValue()
        value <- value + (progress$getMax() - value) / 5
      }
      progress$set(value = value, detail = detail)
    }


    compute_data(updateProgress)


    plot_network(inaug,input$var)
  })
  output$plot <- renderPlot({

    important_words %>%
      filter(Name==input$var) %>%
      ggplot(aes(x=reorder(word,tf_idf),y=tf_idf))+geom_bar(stat="identity")+
      labs(title="Top Terms By TF-IDF Value",x="Word",y="TF-IDF Value")+plotTheme()
    })

})
  • The server.R file takes care of the backend computational process.
  • This is the file that takes care of the logic and the process that runs behind the app.

  • Here we have used the forceNetwork function from the networkD3 library to display the network of the most correlated terms.

  • To avoid excessive size, the terms with the top 10 TF-IDF values are used to find out the correlated terms.

  • As the process takes some time, the shiny progress object is used .

ui.R

library(shiny)
library(tidyverse)
library(tidytext)
library(widyr)

library(stringr)
library(networkD3)

inaug <- read.csv("inaug_speeches.csv",header=T,stringsAsFactors = F)

persons <- inaug %>% select(Name) %>% unique()

shinyUI(fluidPage(


  titlePanel("Visualization of Top correlated Terms and top 10 TF IDF values "),

  sidebarLayout(
    sidebarPanel(
      tags$head(tags$style("#force{height:100vh !important;}")),
      tags$head(tags$style("#plot{height:50vh !important;}")),

      selectInput("var", 
                  label = "President's Name",
                  choices = dput(persons$Name),
                  selected = "George Washington"),
      plotOutput("plot")


    ),
    mainPanel(


      networkD3::forceNetworkOutput("force"),
      style = "border: 1px solid black;"

    )
  )
)
)
  • The above code refers to the ui.R file that takes care of the appearance of the application.

  • The ggplot2 and the networkD3 plots are separated by a solid black line of 1px thick.

  • A select box is used to choose the President’s Name.

  • A bar plot appears at the bottom of this select box which signifies the most important words used by the President in his speech.

  • The network plot, which is adjacent, talks about the most correlated terms to these top TF-IDF terms.

Video Demo

  • As the application is too large to be held on the Shiny Server, we have a video demo
  • You can run the app in your local machine by running runGist("https://gist.github.com/adhok/960e2e6912a5631e3943321a5beb7832").Please install networkD3,tidyr,tidyverse and widyrfor the same.

Useful Resources

Thanks for reading ! You can find the code and the dataset https://github.com/adhok/pip3 . Please feel free to leave a comment below.

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s