Search
TEST 12
In the following tasks, consider these shopping-basket transactions:
<latex> \begin{table}[] \begin{tabular}{cl} \multicolumn{1}{l}{\textbf{Transaction ID}} & \textbf{Items} \\ \hline 1 & sausages, rolls, ketchup \\ 2 & sausages, rolls \\ 3 & sausages, cola, chips \\ 4 & cola, chips \\ 5 & ketchup, chips \\ 6 & sausages, chips, cola \end{tabular} \end{table} </latex>
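The tasks themselves are not reproduced here. Assuming they concern association rules over the table above (an assumption; only the transactions are given), a worked example of support and confidence for the rule {sausages} => {chips}: the itemset {sausages, chips} occurs in transactions 3 and 6, and {sausages} occurs in transactions 1, 2, 3 and 6, so
<latex>
\mathrm{supp}(\{\text{sausages},\text{chips}\}) = \frac{2}{6} \approx 0.33, \qquad
\mathrm{conf}(\{\text{sausages}\} \Rightarrow \{\text{chips}\}) = \frac{\mathrm{supp}(\{\text{sausages},\text{chips}\})}{\mathrm{supp}(\{\text{sausages}\})} = \frac{2/6}{4/6} = 0.5
</latex>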
In today's exercise we will look for the document most relevant to a given query.
The tutorial was taken from http://anythingbutrbitrary.blogspot.cz/2013/03/build-search-engine-in-20-minutes-or.html and adapted.
# documents used to simulate text mining
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year."
doc3 <- "The best food in Columbus, OH is the North Market."
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it."
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats."
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy."
doc7 <- "I have nothing to say. In summary, I have told you nothing."
doc.list <- list(doc1, doc2, doc3, doc4, doc5, doc6, doc7)
N.docs <- length(doc.list)
names(doc.list) <- paste0("doc", 1:N.docs)

query <- "Healthy cat food"

# convert all letters to lowercase
doc.list <- sapply(doc.list, tolower)
query <- tolower(query)
To work with texts in R you need two libraries installed: tm (text mining) and SnowballC.
install.packages("tm") install.packages("SnowballC") library(tm) library(SnowballC)
my.docs <- VectorSource(c(doc.list, query))
my.docs$Names <- c(names(doc.list), "query")

my.corpus <- Corpus(my.docs)
my.corpus
getTransformations()
print(doc.list[1])

# remove punctuation and numbers
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, removeNumbers)
print(my.corpus[[1]]$content)

# reduce words to their stems - running -> run
my.corpus <- tm_map(my.corpus, stemDocument)
print(my.corpus[[1]]$content)

# remove "stop words"
stop <- stopwords("en")
my.corpus <- tm_map(my.corpus, removeWords, words = stop)
print(my.corpus[[1]]$content)

# collapse repeated whitespace
my.corpus <- tm_map(my.corpus, stripWhitespace)
print(my.corpus[[1]]$content)
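A quick sanity check of the preprocessing: the query is stored as the last element of the corpus, so you can print it the same way. With the transformations above it should come out roughly as "healthi cat food" (the exact stems may vary with the tm/SnowballC versions).

# the query is the last document of the corpus
print(my.corpus[[N.docs + 1]]$content)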
term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
dim(term.doc.matrix.stm)
inspect(term.doc.matrix.stm[1:14, ])
term.doc.matrix <- as.matrix(term.doc.matrix.stm)
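Each row of the matrix is one term, each column one document (plus the query as the last column). To see what it holds, you can look up the raw counts of a single term; "cat" is a convenient choice because stemming should map both "cat" and "cats" onto it (an assumption about the stemmer's output):

# raw term frequency of the stem "cat" in every document and in the query
term.doc.matrix["cat", ]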
get.tf.idf.weights <- function(tf.vec, df) {
  # Computes tf-idf weights from a term frequency vector and a document
  # frequency scalar.
  # Example: get.tf.idf.weights(c(1, 2, 3, 0, 0, 6), 4)
  weight <- rep(0, length(tf.vec))
  # tf * idf
  weight[tf.vec > 0] <- (1 + log2(tf.vec[tf.vec > 0])) * log2(N.docs / df)
  weight
}

get.weights.per.term.vec <- function(tfidf.row) {
  # document frequency is counted over the documents only, not the query
  term.df <- sum(tfidf.row[1:N.docs] > 0)
  tf.idf.vec <- get.tf.idf.weights(tfidf.row, term.df)
  return(tf.idf.vec)
}

tfidf.matrix <- t(apply(term.doc.matrix, 1, FUN = get.weights.per.term.vec))
colnames(tfidf.matrix) <- colnames(term.doc.matrix)
tfidf.matrix[1:3, ]
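As a sanity check of the weighting scheme: with N.docs = 7 as in this exercise, a term with df = 4 gets idf = log2(7/4) ≈ 0.81, so tf = 2 yields (1 + log2 2) · 0.81 ≈ 1.61.

# reproduce the example from the comment above by hand
get.tf.idf.weights(c(1, 2, 3, 0, 0, 6), 4)
# expected: roughly 0.81 1.61 2.09 0.00 0.00 2.89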
To rank the documents against the query we use cosine similarity, which builds on the relation between the dot product of two vectors and the angle between them:
<latex>
\mathbf{a}\cdot\mathbf{b} =\left\|\mathbf{a}\right\|\left\|\mathbf{b}\right\|\cos\theta
</latex>
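Dividing both sides by the vector norms gives the cosine similarity used for scoring:
<latex>
\cos \theta = \frac{\mathbf{a}\cdot\mathbf{b}}{\left\|\mathbf{a}\right\|\left\|\mathbf{b}\right\|}
</latex>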
How cosine similarity behaves for different angles is illustrated by the following plot.
angle <- seq(-pi, pi, by = pi/16)
plot(cos(angle) ~ angle, type = "b", xlab = "angle in radians",
     main = "Cosine similarity by angle")
Normalizing the vectors to unit length makes the computation easier - for unit vectors the cosine reduces to a plain dot product: <latex> \cos \theta = \mathbf{a}\cdot\mathbf{b} </latex>
tfidf.matrix <- scale(tfidf.matrix, center = FALSE,
                      scale = sqrt(colSums(tfidf.matrix^2)))
tfidf.matrix[1:3, ]
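After scaling, every column should have unit length, which you can verify (assuming no document ended up with an all-zero tf-idf vector, which would give NaN):

# squared norms of all columns - each should be (approximately) 1
colSums(tfidf.matrix^2)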
Computing the similarity
# split the matrix into the query vector and the document vectors
query.vector <- tfidf.matrix[, (N.docs + 1)]
tfidf.matrix <- tfidf.matrix[, 1:N.docs]

# matrix multiplication: the dot product of the query with every document
doc.scores <- t(query.vector) %*% tfidf.matrix

# build the output and sort it by score
results.df <- data.frame(doc = names(doc.list),
                         score = t(doc.scores),
                         text = unlist(doc.list))
results.df <- results.df[order(results.df$score, decreasing = TRUE), ]

# result
options(width = 2000)
print(results.df, row.names = FALSE, right = FALSE, digits = 2)
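The matrix product above is just a per-document dot product. A minimal equivalent written as an explicit loop, for illustration only:

# score each document by the dot product of its column with the query vector
scores.loop <- sapply(1:N.docs, function(i) sum(query.vector * tfidf.matrix[, i]))
all.equal(as.vector(doc.scores), scores.loop)  # should be TRUE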