Concordancing with R

In this post I am going to show how you can use R as a concordancing tool. You can download the R script for the function ConcR here and use it to create concordances with R.

Using R as a concordancing tool has several advantages over ready-made concordance applications such as WordSmith, AntConc, or MonoConc.

I think R outperforms such applications when creating KWIC (keyword-in-context) displays for at least three reasons:

  • By writing your own script, you gain a better understanding of what you have actually searched for and what you may have missed.
  • More important, however, is that by using R and saving your script you render your analysis fully transparent and reproducible: other researchers can retrace your analysis step by step and check it for possible mistakes, blunders, or outright forgery.
  • Last but not least, using scripts will save you a lot of time in the long run. Writing them may be time-consuming at first, but in the end extracting matches becomes much more efficient because you can re-use parts of your script.

To use R for creating concordances, we will write a function and then apply it to a sample corpus that contains only three short files.

But now, let’s start…

##################################################################
# Start defining function
# (the output is returned rather than saved to disk)
ConcR <- function(pathname, search.pattern, range, exact = FALSE, all.pre = FALSE) {
# Install required packages
  #install.packages("plyr")
  #install.packages("data.table")
  #install.packages("stringr")
  #install.packages("tm")
  # Load packages
  require("plyr")
  require("data.table")
  require("stringr")
  require("tm")
###############################################################
# load file IDs
corpus.files = list.files(path = pathname, pattern = NULL, all.files = T,
  full.names = T, recursive = T, ignore.case = T, include.dirs = T)
# modify search pattern to extract from space to space
  search.pattern.new <- paste("qwertz", search.pattern, "asdfgh", collapse = "")
  search.pattern.new <- gsub("qwertz ", "qwertz", search.pattern.new, fixed = T)
  search.pattern.new <- gsub(" asdfgh", "asdfgh", search.pattern.new, fixed = T)
  search.pattern.new <- gsub("qwertz", "[A-Z]{0,1}[a-z]*", search.pattern.new, fixed = T)
  search.pattern.new <- gsub("asdfgh", "[a-z]*", search.pattern.new, fixed = T)
# implement new search pattern (if exact = false)
  ifelse(exact == TRUE, search.pattern <- search.pattern, search.pattern <- search.pattern.new)
###############################################################
# Tokenize the corpus files
corpus.tmp <- sapply(corpus.files, function(x) {
  x <- scan(x, what = "char", quiet = T)
  x <- paste(x, collapse = " ")
#  x <- unlist(strsplit(x, " ")) # activate for word concordance (if you are looking for individual words)
  x <- gsub(" {2,}" , " ", x)
  x <- str_trim(x, side = "both")
  }  )
# Extract the positions of the tokens
concordance.index <- sapply(corpus.tmp, function(x)  {
  x <- str_locate_all(x, search.pattern)  }  )
###############################################################
# Extract tokens
token <- sapply(corpus.tmp, function(x)  {
  x <- str_extract_all(x, search.pattern)  }  )
# clean tokens
token <- sapply(token, function(x)  {
  x <- str_trim(x, side = "both")  }  )
###############################################################
# Extract subsequent elements (limited)
post <- sapply(corpus.tmp, function(file) {
  conc.index <- sapply(file, function(y)  {
    str_locate_all(y, search.pattern)  }  )
  start <- as.vector(sapply(conc.index, function(a){
    a <- as.numeric(a[, "end"])
    a <- as.numeric(a) + 1
    }  )  )
  end <- as.vector(sapply(conc.index, function(b){
    b <- as.numeric(b[, "end"])
    b <- as.numeric(b) + 1
    b <- as.numeric(b) + range
    }  )  )
  positions <- cbind(start, end)
  sapply(seq_along(file), function(i) {
    str_sub(file[i], positions[ ,"start"], positions[ ,"end"])
    }  )
  }  )
###############################################################
# Extract preceding elements (limited)
pre <- sapply(corpus.tmp, function(file) {
  conc.index <- sapply(file, function(y)  {
    str_locate_all(y, search.pattern)  }  )
  start <- as.vector(sapply(conc.index, function(a){
    a <- as.numeric(a[, "start"])
    a <- ifelse(as.numeric(a) - range < 0, 1, as.numeric(a) - range)
    }  )  )
  end <- as.vector(sapply(conc.index, function(b){
    b <- as.numeric(b[, "start"])
    b <- as.numeric(b) - 1
    }  )  )
  positions <- cbind(start, end)
  sapply(seq_along(file), function(i) {
    str_sub(file[i], positions[ ,"start"], positions[ ,"end"])
    }  )
  }  )
###############################################################
# Extract all preceding elements
pre.all <- sapply(corpus.tmp, function(file) {
  conc.index <- sapply(file, function(y)  {
    str_locate_all(y, search.pattern)  }  )
  start <- as.vector(sapply(conc.index, function(a){
    a <- as.numeric(a[, "start"])
    a <- 1
    }  )  )
  end <- as.vector(sapply(conc.index, function(b){
    b <- as.numeric(b[, "start"])
    b <- as.numeric(b) - 1
    }  )  )
  positions <- cbind(start, end)
  sapply(seq_along(file), function(i) {
    str_sub(file[i], positions[ ,"start"], positions[ ,"end"])
    }  )
  }  )
###############################################################
text.id <- as.vector(unlist(sapply(names(token), function(x) {
  x <- gsub(".*/", "", x)
  x <- gsub("\\ .*", "", x)
  x <- gsub("\\.TXT", "", x)
  x <- gsub("\\.txt", "", x) } )))
len <- as.vector(unlist(sapply(token, function(x) {
  x <- length(x)} )))
text.id <- rep(text.id, len)
###############################################################
### Vectorize lists
# vectorize tokens
token <- as.vector(unlist(token))
# vectorize pre
pre <- as.vector(unlist(pre))
# vectorize post
post <- as.vector(unlist(post))
# vectorize pre.all
pre.all <- as.vector(unlist(pre.all))
###############################################################
# Create a vector out of the clean corpus material surrounding the match
asone.tb <- matrix(cbind(pre, rep("<<", length(pre)), token, rep(">>", length(pre)), post), ncol = 5)
asone <- apply(asone.tb, 1, paste, collapse = " ")
asone <-  gsub(" {2,}", " ", asone)
asone <- str_trim(asone, side = "both")
###############################################################
# Create a table of the extracted information
redux <- data.frame(1:length(text.id), text.id, pre, token, post, asone)
colnames(redux) <- c("id", "text.id", "previous element(s)","token","following element(s)", "as one")
full <- data.frame(1:length(text.id), text.id, pre, token, post, pre.all, asone)
colnames(full) <- c("id", "text.id", "previous element(s)","token","following element(s)", "previous context", "as one")
ifelse(all.pre == FALSE, kwic.tmp <- redux, kwic.tmp <- full)
###############################################################
# Create txt file in which we store the results
#output.file = file.create(outputpath, showWarnings = F)
# Store the txt file in the output file
#write.table(kwic.tmp, outputpath, sep = "\t", row.names = F)
# Return
return(list(kwic.tmp))
# End function
  }
###############################################################
### ---                    THE END
###############################################################
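
Before we try the function out, a quick note on the search-pattern expansion near the top of ConcR: unless you set exact = TRUE, the search pattern is wrapped in character classes so that the complete words surrounding the match are extracted as well. For the search term "is is", the expansion works out as follows (this is merely the relevant snippet from the function, run in isolation):

# Illustration of the pattern expansion inside ConcR for the search term "is is"
search.pattern <- "is is"
tmp <- paste("qwertz", search.pattern, "asdfgh", collapse = "")
tmp <- gsub("qwertz ", "qwertz", tmp, fixed = TRUE)
tmp <- gsub(" asdfgh", "asdfgh", tmp, fixed = TRUE)
tmp <- gsub("qwertz", "[A-Z]{0,1}[a-z]*", tmp, fixed = TRUE)
tmp <- gsub("asdfgh", "[a-z]*", tmp, fixed = TRUE)
tmp
#> [1] "[A-Z]{0,1}[a-z]*is is[a-z]*"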

Let’s try out our function with an example…
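
If you want to follow along, you can create such a toy corpus yourself. The exact contents of the three files do not matter; the folder name and the sentences below are only made up for illustration (loosely modelled on the example output further down), so adjust the pathname in the example accordingly:

# Create a small test corpus of three plain-text files (illustrative content only)
dir.create("TestCorpus", showWarnings = FALSE)
writeLines("This is the first sentence in the test corpus. This is a second sentence in the first file.",
  file.path("TestCorpus", "text1.txt"))
writeLines("This is a second file with some more sentences.",
  file.path("TestCorpus", "text2.txt"))
writeLines("Finally, this is the last file of the test corpus. Since I am quite lazy, this is the last sentence in this corpus.",
  file.path("TestCorpus", "text3.txt"))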

###############################################################
# Example
pathname <- "C:\\03-MyProjects\\ConcR\\TestCorpus"
search.pattern <-  c("is is")
exact <- FALSE
range <- 20
all.pre <- FALSE
test <- ConcR(pathname, search.pattern, range=20, exact = FALSE, all.pre = FALSE)
head(test[[1]])
 
#>   id text.id  previous element(s)   token  following element(s)
#>1  1   text1                      This is  the first sentence i
#>2  2   text1 of the test corpus.  This is  a second sentence in
#>3  3   text2                      This is  a second file with s
#>4  4   text3            Finally,  this is  the last file of the
#>5  5   text3 ce I am quite lazy,  this is  the last sentence in
#>                                                  as one
#>1                     << This is >> the first sentence i
#>2 of the test corpus. << This is >> a second sentence in
#>3                     << This is >> a second file with s
#>4            Finally, << this is >> the last file of the
#>5 ce I am quite lazy, << this is >> the last sentence in
###############################################################
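
Two optional follow-ups: if you call ConcR with all.pre = TRUE, the returned table contains an additional column holding the complete preceding context of each match; and, just as the commented-out lines at the end of the function suggest, you can store the concordance as a tab-separated text file (the output path below is only an example):

# Re-run the search and include the complete preceding context
test.full <- ConcR(pathname, search.pattern, range = 20, exact = FALSE, all.pre = TRUE)
# Save the concordance to disk as a tab-separated text file (example path)
outputpath <- "C:\\03-MyProjects\\ConcR\\ConcR_results.txt"
write.table(test[[1]], outputpath, sep = "\t", row.names = FALSE)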

I hope that I was able to show you how you can use R as an alternative to other concordance applications.
