Skip to content

Commit 7958e81

Browse files
committed
1.0.5-3 sent to CRAN
1 parent afc71d3 commit 7958e81

File tree

58 files changed

+1009
-1106
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1009
-1106
lines changed

.Rbuildignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ ignore/*
1414
^codecov\.yml$
1515
^dev$
1616
^README\.html$
17-
^inst/examples/data\.qmd$
18-
^inst/examples/text\.qmd$
1917
^LICENSE\.md$
2018
^install\.ps1$
2119
^article$

CRAN-SUBMISSION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
Version: 1.0.5-2
2-
Date: 2024-04-29 22:54:47 UTC
3-
SHA: 6bd494634d077544f5f7c628dafc19cb5a0fb987
2+
Date: 2024-05-14 20:20:09 UTC
3+
SHA: afc71d378ab49fa29ae1c6075d499f8249a80110

DESCRIPTION

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,35 @@
11
Package: tabulapdf
22
Type: Package
33
Title: Extract Tables from PDF Documents
4-
Version: 1.0.5-2
5-
Authors@R: c(person("Thomas J.", "Leeper",
6-
role = "aut",
7-
email = "thosjleeper@gmail.com",
8-
comment = c(ORCID = "0000-0003-4097-6326")),
9-
person("Mauricio", "Vargas Sepulveda",
10-
role = c("aut","cre"),
11-
email = "m.sepulveda@mail.utoronto.ca",
12-
comment = c(ORCID = "0000-0003-1017-7574")),
13-
person("Tom", "Paskhalis",
14-
role = "aut",
15-
email = "tpaskhalis@gmail.com",
16-
comment = c(ORCID = "0000-0001-9298-8850")),
17-
person("Manuel", "Aristaran",
18-
role = "ctb"),
19-
person("David", "Gohel",
20-
role = "ctb",
21-
comment = "rOpenSci reviewer"),
22-
person("Lincoln", "Mullen",
23-
role = "ctb",
24-
comment = "rOpenSci reviewer"))
254
Description: Bindings for the 'Tabula' <https://tabula.technology/> 'Java'
26-
library, which can extract tables from PDF documents.
5+
library, which can extract tables from PDF files. This tool can reduce time
6+
and effort in data extraction processes in fields like investigative
7+
journalism. It allows for automatic and manual table extraction, the latter
8+
facilitated through a 'Shiny' interface, enabling manual area selection
9+
with a computer mouse for data retrieval.
10+
Version: 1.0.5-2
11+
Authors@R: c(
12+
person("Thomas J.", "Leeper",
13+
role = "aut",
14+
email = "thosjleeper@gmail.com",
15+
comment = c(ORCID = "0000-0003-4097-6326")),
16+
person("Mauricio", "Vargas Sepulveda",
17+
role = c("aut","cre"),
18+
email = "m.sepulveda@mail.utoronto.ca",
19+
comment = c(ORCID = "0000-0003-1017-7574")),
20+
person("Tom", "Paskhalis",
21+
role = "aut",
22+
email = "tpaskhalis@gmail.com",
23+
comment = c(ORCID = "0000-0001-9298-8850")),
24+
person("Manuel", "Aristaran",
25+
role = "ctb"),
26+
person("David", "Gohel",
27+
role = "ctb",
28+
comment = "rOpenSci reviewer"),
29+
person("Lincoln", "Mullen",
30+
role = "ctb",
31+
comment = "rOpenSci reviewer")
32+
)
2733
License: Apache License (>= 2)
2834
URL: https://docs.ropensci.org/tabulapdf/ (website)
2935
https://github.com/ropensci/tabulapdf/

R/extract_metadata.R

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,42 +9,40 @@
99
#' @return A list.
1010
#' @author Thomas J. Leeper <thosjleeper@gmail.com>
1111
#' @examples
12-
#' \dontrun{
1312
#' # simple demo file
14-
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
13+
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
1514
#'
1615
#' extract_metadata(f)
17-
#' }
1816
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{extract_text}}, \code{\link{split_pdf}}
1917
#' @importFrom rJava J new
2018
#' @export
2119
extract_metadata <- function(file, password = NULL, copy = FALSE) {
22-
pdfDocument <- load_doc(file, password = password, copy = copy)
23-
on.exit(pdfDocument$close())
20+
pdfDocument <- load_doc(file, password = password, copy = copy)
21+
on.exit(pdfDocument$close())
2422

25-
info <- pdfDocument$getDocumentInformation()
23+
info <- pdfDocument$getDocumentInformation()
2624

27-
info_creation_date <- info$getCreationDate()
28-
info_modification_date <- info$getModificationDate()
25+
info_creation_date <- info$getCreationDate()
26+
info_modification_date <- info$getModificationDate()
2927

30-
if (!is.null(info_creation_date)) {
31-
info_creation_date <- info_creation_date$getTime()$toString()
32-
}
28+
if (!is.null(info_creation_date)) {
29+
info_creation_date <- info_creation_date$getTime()$toString()
30+
}
3331

34-
if (!is.null(info_modification_date)) {
35-
info_modification_date <- info_modification_date$getTime()$toString()
36-
}
32+
if (!is.null(info_modification_date)) {
33+
info_modification_date <- info_modification_date$getTime()$toString()
34+
}
3735

38-
list(
39-
pages = pdfDocument$getNumberOfPages(),
40-
title = info$getTitle(),
41-
author = info$getAuthor(),
42-
subject = info$getSubject(),
43-
keywords = info$getKeywords(),
44-
creator = info$getCreator(),
45-
producer = info$getProducer(),
46-
created = info_creation_date,
47-
modified = info_modification_date,
48-
trapped = info$getTrapped()
49-
)
36+
list(
37+
pages = pdfDocument$getNumberOfPages(),
38+
title = info$getTitle(),
39+
author = info$getAuthor(),
40+
subject = info$getSubject(),
41+
keywords = info$getKeywords(),
42+
creator = info$getCreator(),
43+
producer = info$getProducer(),
44+
created = info_creation_date,
45+
modified = info_modification_date,
46+
trapped = info$getTrapped()
47+
)
5048
}

R/extract_tables.R

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,25 +34,11 @@
3434
#' @references \href{https://tabula.technology/}{Tabula}
3535
#' @author Thomas J. Leeper <thosjleeper@gmail.com>, Tom Paskhalis <tpaskhalis@gmail.com>
3636
#' @examples
37-
#' \dontrun{
3837
#' # simple demo file
39-
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
40-
#'
41-
#' # extract all tables
42-
#' extract_tables(f)
38+
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
4339
#'
4440
#' # extract tables from only the second page
4541
#' extract_tables(f, pages = 2)
46-
#'
47-
#' # extract areas from a page
48-
#' ## full table
49-
#' extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)))
50-
#' ## part of the table
51-
#' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
52-
#'
53-
#' # return tibbles
54-
#' extract_tables(f, pages = 2, output = "tibble")
55-
#' }
5642
#' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
5743
#' @importFrom utils download.file
5844
#' @importFrom readr read_delim

R/extract_text.R

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,8 @@
1212
#' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}.
1313
#' @author Thomas J. Leeper <thosjleeper@gmail.com>
1414
#' @examples
15-
#' \dontrun{
1615
#' # simple demo file
17-
#' f <- system.file("examples", "text.pdf", package = "tabulapdf")
16+
#' f <- system.file("examples", "fortytwo.pdf", package = "tabulapdf")
1817
#'
1918
#' # extract all text
2019
#' extract_text(f)
@@ -24,7 +23,6 @@
2423
#'
2524
#' # extract text from selected area only
2625
#' extract_text(f, area = list(c(209.4, 140.5, 304.2, 500.8)))
27-
#' }
2826
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}}
2927
#' @importFrom rJava J new
3028
#' @export

R/get_page_dims.R

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,42 +13,40 @@
1313
#' @references \href{https://tabula.technology/}{Tabula}
1414
#' @author Thomas J. Leeper <thosjleeper@gmail.com>
1515
#' @examples
16-
#' \dontrun{
1716
#' # simple demo file
18-
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
17+
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
1918
#'
2019
#' get_n_pages(file = f)
2120
#' get_page_dims(f)
22-
#' }
2321
#' @importFrom tools file_path_sans_ext
2422
#' @importFrom rJava J new
2523
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_text}}, \code{\link{make_thumbnails}}
2624
#' @export
2725
get_page_dims <- function(file, doc, pages = NULL, password = NULL, copy = FALSE) {
28-
if (!missing(file)) {
29-
doc <- load_doc(file, password = password, copy = copy)
30-
on.exit(doc$close())
31-
}
26+
if (!missing(file)) {
27+
doc <- load_doc(file, password = password, copy = copy)
28+
on.exit(doc$close())
29+
}
3230

33-
if (!is.null(pages)) {
34-
pages <- as.integer(pages)
35-
} else {
36-
pages <- 1L:(get_n_pages(doc = doc))
37-
}
31+
if (!is.null(pages)) {
32+
pages <- as.integer(pages)
33+
} else {
34+
pages <- 1L:(get_n_pages(doc = doc))
35+
}
3836

39-
allpages <- doc$getDocumentCatalog()$getPages()
40-
lapply(pages, function(x) {
41-
thispage <- allpages$get(x - 1L)
42-
c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
43-
})
37+
allpages <- doc$getDocumentCatalog()$getPages()
38+
lapply(pages, function(x) {
39+
thispage <- allpages$get(x - 1L)
40+
c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
41+
})
4442
}
4543

4644
#' @rdname get_page_dims
4745
#' @export
4846
get_n_pages <- function(file, doc, password = NULL, copy = FALSE) {
49-
if (!missing(file)) {
50-
doc <- load_doc(file, password = password, copy = copy)
51-
on.exit(doc$close())
52-
}
53-
doc$getNumberOfPages()
47+
if (!missing(file)) {
48+
doc <- load_doc(file, password = password, copy = copy)
49+
on.exit(doc$close())
50+
}
51+
doc$getNumberOfPages()
5452
}

0 commit comments

Comments
 (0)