This Milestone Report covers the exploratory data analysis for the Capstone Project of the Coursera Data Science Specialization. Coursera and SwiftKey are partnering on this project, which applies data science to natural language processing. The project uses a large corpus of text documents to predict the next word based on the preceding input. The data is extracted from the source files, cleaned, and used in a Shiny application. Here we summarize the corpus and outline a plan for building the predictive model.
The data comes from the HC Corpora collection, which is available in four languages, but only the English corpus will be used. The English dataset consists of three files:
- en_US.blogs.txt
- en_US.news.txt
- en_US.twitter.txt
The data was downloaded from the Coursera link to the local machine and will be read from the local disk.
# specify the source and destination of the download
destination_file <- "Coursera-SwiftKey.zip"
source_file <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# execute the download
download.file(source_file, destination_file)
# extract the files from the zip file
unzip(destination_file)
# inspect the unzipped files
# find out which files were unzipped
unzip(destination_file, list = TRUE )
## Name Length Date
## 1 final/ 0 2014-07-22 10:10:00
## 2 final/de_DE/ 0 2014-07-22 10:10:00
## 3 final/de_DE/de_DE.twitter.txt 75578341 2014-07-22 10:11:00
## 4 final/de_DE/de_DE.blogs.txt 85459666 2014-07-22 10:11:00
## 5 final/de_DE/de_DE.news.txt 95591959 2014-07-22 10:11:00
## 6 final/ru_RU/ 0 2014-07-22 10:10:00
## 7 final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
## 8 final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
## 9 final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
## 10 final/en_US/ 0 2014-07-22 10:10:00
## 11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
## 12 final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
## 13 final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
## 14 final/fi_FI/ 0 2014-07-22 10:10:00
## 15 final/fi_FI/fi_FI.news.txt 94234350 2014-07-22 10:11:00
## 16 final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
## 17 final/fi_FI/fi_FI.twitter.txt 25331142 2014-07-22 10:10:00
# inspect the data
list.files("final")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
list.files("final/en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
The corpora are contained in three separate plain-text files; the news file has to be read in binary mode so that readLines does not stop early at an embedded control character (see [@newtest] for details). We import these files as follows.
# import the blogs and twitter datasets in text mode
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding="UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding="UTF-8")
## Warning in readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 167155 appears to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 268547 appears to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 1274086 appears to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 1759032 appears to contain an embedded nul
# import the news dataset in binary mode
con <- file("final/en_US/en_US.news.txt", open="rb")
news <- readLines(con, encoding="UTF-8")
close(con)
rm(con)
Before we analyse the content of the files, we look at their size on disk (in megabytes, MB) and at the number of lines, characters, and words in each file.
# file size (in MegaBytes/MB)
file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
## [1] 200.4242
file.info("final/en_US/en_US.news.txt")$size / 1024^2
## [1] 196.2775
file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
## [1] 159.3641
# for this part of the analysis we need the stringi library
library(stringi)
# we analyse the number of lines and characters in each file
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096031 134082634
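Since we are also interested in the number of words in each file, a minimal sketch using stri_count_words from the stringi package (loaded above) could look like this:
# approximate word counts per file
sum(stri_count_words(blogs))
sum(stri_count_words(news))
sum(stri_count_words(twitter))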
set.seed(1012)
sTwitter <- sample(twitter, size = 5000, replace = TRUE)
sBlogs <- sample(blogs, size = 5000, replace = TRUE)
sNews <- sample(news, size = 5000, replace = TRUE)
sampleTotal <- c(sTwitter, sBlogs, sNews)
length(sampleTotal)
## [1] 15000
writeLines(sampleTotal, "sampleTotal.txt")
## Using the tm package to read the sample text into a corpus
library(tm)
textCon <- file("sampleTotal.txt")
textCorpus <- readLines(textCon)
close(textCon)
textCorpus <- Corpus(VectorSource(textCorpus)) # tm reads the text as a list of documents
## Using the TM Package to clean the text
textCorpus <- tm_map(textCorpus, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))
textCorpus <- tm_map(textCorpus, content_transformer(tolower)) # converting to lowercase
textCorpus <- tm_map(textCorpus, removePunctuation, preserve_intra_word_dashes=TRUE) # removing punctuation
# removing profanity words
profanityWords <- readLines("profanity-words.txt")
textCorpus <- tm_map(textCorpus, removeWords, profanityWords)
textCorpus <- tm_map(textCorpus, content_transformer(removeNumbers)) # removing numbers
## removing URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
textCorpus <- tm_map(textCorpus, content_transformer(removeURL))
textCorpus <- tm_map(textCorpus, removeWords, stopwords("english")) # removing stop words in English (a, as, at, so, etc.)
textCorpus <- tm_map(textCorpus, stripWhitespace) ## Stripping unnecessary whitespace from document
## convert the cleaned corpus back to plain text and build a data frame
finalCorpus <- data.frame(text = sapply(textCorpus, as.character), stringsAsFactors = FALSE)
## Tokenizer function to get unigrams
library(RWeka)
unigram <- NGramTokenizer(finalCorpus$text, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq,decreasing = TRUE),]
names(unigram) <- c("word1", "freq")
head(unigram)
## word1 freq
## 28174 said 1521
## 35808 will 1337
## 22938 one 1274
## 17675 just 1170
## 18987 like 1107
## 5278 can 1057
unigram$word1 <- as.character(unigram$word1)
write.csv(unigram[unigram$freq > 1,],"unigram.csv",row.names=F)
unigram <- read.csv("unigram.csv",stringsAsFactors = F)
saveRDS(unigram, file = "unigram.RData")
library(ggplot2)
g1 <- ggplot(data=unigram[1:10,], aes(x = word1, y = freq))
g2 <- g1 + geom_bar(stat="identity") + coord_flip() + ggtitle("Frequently Used Words")
g3 <- g2 + geom_text(data = unigram[1:10,], aes(x = word1, y = freq, label = freq), hjust=-1, position = "identity")
g3
# Tokenizer function to get bigrams
bigram <- NGramTokenizer(finalCorpus$text, Weka_control(min = 2, max = 2,delimiters = " \\r\\n\\t.,;:\"()?!"))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq,decreasing = TRUE),]
names(bigram) <- c("words","freq")
head(bigram)
## words freq
## 126864 new york 92
## 102977 last year 76
## 158746 right now 65
## 102966 last week 63
## 215451 years ago 62
## 68758 first time 61
bigram$words <- as.character(bigram$words)
str2 <- strsplit(bigram$words,split=" ")
bigram <- transform(bigram,
one = sapply(str2,"[[",1),
two = sapply(str2,"[[",2))
bigram <- data.frame(word1 = bigram$one,word2 = bigram$two,freq = bigram$freq,stringsAsFactors=FALSE)
## saving files
write.csv(bigram[bigram$freq > 1,],"bigram.csv",row.names=F)
# Tokenizer function to get trigrams
trigram <- NGramTokenizer(finalCorpus$text, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq,decreasing = TRUE),]
names(trigram) <- c("words","freq")
head(trigram)
## words freq
## 32097 cant wait see 14
## 141291 new york times 13
## 141241 new york city 11
## 162957 president barack obama 11
## 117061 let us know 8
## 88164 gov chris christie 7
trigram$words <- as.character(trigram$words)
str3 <- strsplit(trigram$words,split=" ")
trigram <- transform(trigram,
one = sapply(str3,"[[",1),
two = sapply(str3,"[[",2),
three = sapply(str3,"[[",3))
# trigram$words <- NULL
trigram <- data.frame(word1 = trigram$one,word2 = trigram$two,
word3 = trigram$three, freq = trigram$freq,stringsAsFactors=FALSE)
# saving files
write.csv(trigram[trigram$freq > 1,],"trigram.csv",row.names=F)
# Tokenizer function to get quadgrams
quadgram <- NGramTokenizer(finalCorpus$text, Weka_control(min = 4, max = 4,delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgram <- data.frame(table(quadgram))
quadgram <- quadgram[order(quadgram$Freq,decreasing = TRUE),]
names(quadgram) <- c("words","freq")
quadgram$words <- as.character(quadgram$words)
str4 <- strsplit(quadgram$words,split=" ")
quadgram <- transform(quadgram,
one = sapply(str4,"[[",1),
two = sapply(str4,"[[",2),
three = sapply(str4,"[[",3),
four = sapply(str4,"[[",4))
# quadgram$words <- NULL
quadgram <- data.frame(word1 = quadgram$one,
word2 = quadgram$two,
word3 = quadgram$three,
word4 = quadgram$four,
freq = quadgram$freq, stringsAsFactors=FALSE)
# saving files
write.csv(quadgram[quadgram$freq > 1,],"quadgram.csv",row.names=F)
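For consistency with the unigram step above, the remaining n-gram tables could also be stored as RDS files, which are compressed and fast to load in the Shiny app; this is only a sketch of that option, not part of the original pipeline.
# optionally store the remaining n-gram tables as serialized R objects
saveRDS(bigram, file = "bigram.RData")
saveRDS(trigram, file = "trigram.RData")
saveRDS(quadgram, file = "quadgram.RData")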
- The whole process of reading the files, cleaning the text, and creating the n-grams is time-consuming for the computer.
- NLP is computationally intensive, and a lot of experimentation is needed to build efficient n-grams while keeping the file sizes to a minimum.
- The word-removal (cleaning) techniques are sometimes not as precise as one might suppose.
- Improving the quality of the n-gram tokenization could be critical to the accuracy of the prediction algorithm.
- Build a Shiny app that allows the user to input a phrase and obtain a suggestion for the next word.
- Develop the prediction algorithm to be implemented in the Shiny app (a minimal backoff sketch is given below).
- Prepare a pitch about the app and publish it on the shinyapps.io server.
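As a preview of the prediction step, the sketch below shows a simple backoff lookup over the bigram and trigram tables built above. The function predictNextWord is a hypothetical helper used for illustration only, not the final algorithm.
# minimal backoff sketch: try the trigram table first, then fall back to the bigram table
# (predictNextWord is a hypothetical helper, not the final implementation)
predictNextWord <- function(phrase) {
  tokens <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  if (length(tokens) == 2) {
    hit <- trigram[trigram$word1 == tokens[1] & trigram$word2 == tokens[2], ]
    if (nrow(hit) > 0) return(hit$word3[which.max(hit$freq)])
  }
  hit <- bigram[bigram$word1 == tail(tokens, 1), ]
  if (nrow(hit) > 0) return(hit$word2[which.max(hit$freq)])
  NA_character_
}
predictNextWord("new york") # with the sample tables above this would return "times"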