Download and Parse GitHub Issues

# devtools::install_github("cscheid/rgithub")
library("github")
Error in library("github"): there is no package called 'github'
library("dplyr")
library("stringr")
library("lubridate")
library("readr")
# Authenticate against the GitHub API; the OAuth client id and secret are read
# from the GH_CLIENT_ID and GH_CLIENT_SECRET environment variables.
ctx <- interactive.login(
  Sys.getenv("GH_CLIENT_ID"),
  Sys.getenv("GH_CLIENT_SECRET")
)

Thanks to @realAkhmed for the pagination helper below; see: https://github.com/cscheid/rgithub/issues/30#issuecomment-150354560

# Download every page of a paginated API request and merge the results.
#
# Args:
#   f: An unevaluated call to a function that accepts a `page` argument and
#      returns a list with a `content` element, e.g.
#      auto.page(get.repository.issues("owner", "repo", ctx = ctx)).
#      The call is captured with substitute() and re-evaluated in the caller's
#      frame once per page, with `page` set to 1, 2, 3, ...
#
# Returns:
#   The response of the final request, with `content` replaced by the
#   concatenation of the `content` lists of all non-empty pages.
#
# Raises:
#   An error if `f` is not a call, or if a response carries an `ok` element
#   that is not TRUE.
auto.page <- function(f) {
  f_call <- substitute(f)
  stopifnot(is.call(f_call))

  i <- 1
  req <- list()
  result_lst <- list()

  repeat {
    # Specify the page to download
    f_call$page <- i

    req <- eval(f_call, parent.frame())

    # The loop below only stops on an empty `content`. A failed request that
    # returns a non-empty error payload would therefore be re-requested
    # forever, so stop as soon as a response is flagged as unsuccessful.
    # NOTE(review): assumes responses expose success as a logical `ok`
    # element, as rgithub does; responses without it are treated as before.
    ok <- req[["ok"]]
    if (!is.null(ok) && !isTRUE(ok)) {
      stop("Request for page ", i, " failed; aborting pagination.",
           call. = FALSE)
    }

    # Last page has empty content
    if (length(req$content) <= 0) break

    result_lst[[i]] <- req$content
    i <- i + 1
  }

  # Reuse the last response for its other fields and splice in the
  # accumulated content, flattened by one level (a list of entries).
  result_req <- req
  result_req$content <- unlist(result_lst, recursive = FALSE)

  result_req
}
# Fetch all issue comments for ropensci/RNeXML, one page at a time.
issues <- auto.page(
  github::get.all.repository.issues.comments("ropensci", "RNeXML", ctx = ctx)
)
# Number of comment entries retrieved.
length(issues$content)

Here we extract the fields of interest from each comment.

# Convert one issue-comment entry into a tibble row.
#
# Args:
#   entry: A list with elements `issue_url`, `id`, `user$login`,
#          `created_at`, `updated_at` and `body`.
#
# Returns:
#   A tibble with columns issue, comment_id, user, created_at, updated_at and
#   body (one row when every field holds a single value).
to_df <- function(entry) {
  # data_frame() is deprecated; tibble() is its direct replacement.
  dplyr::tibble(
    # Keep only the text after the last "/" of the URL, as a character string.
    issue = stringr::str_replace(entry$issue_url, ".*/(.*$)", "\\1"),
    comment_id = entry$id,
    user = entry$user$login,
    created_at = lubridate::parse_date_time(
      entry$created_at, "%Y-%m-%d %H:%M:%S"
    ),
    updated_at = lubridate::parse_date_time(
      entry$updated_at, "%Y-%m-%d %H:%M:%S"
    ),
    body = entry$body
  )
}

Minor problem: the comments endpoint doesn’t actually include each issue’s title and opening comment (or its tags, status, and other metadata), all of which come from the issues endpoint:

# Fetch every issue of ropensci/RNeXML (any state), one page at a time.
issue_meta <- auto.page(
  github::get.repository.issues(
    "ropensci", "RNeXML",
    state = "all", filter = "all", ctx = ctx
  )
)
# Convert one issue entry into a tibble row.
#
# Args:
#   entry: A list with elements `html_url`, `id`, `user$login`, `state`,
#          `comments`, `created_at`, `updated_at`, `title` and `body`.
#
# Returns:
#   A tibble with columns issue, comment_id, user, state, comments,
#   created_at, updated_at, title and body (one row when every field holds a
#   single value).
meta_to_df <- function(entry) {
  # The GitHub API reports a null body for an issue opened without a
  # description; store NA so the `body` column is present for every entry.
  body <- entry$body
  if (is.null(body)) {
    body <- NA_character_
  }

  # data_frame() is deprecated; tibble() is its direct replacement.
  dplyr::tibble(
    # Keep only the text after the last "/" of the URL, as a character string.
    issue = stringr::str_replace(entry$html_url, ".*/(.*$)", "\\1"),
    # Holds the issue's own id; the column is named comment_id so that it
    # lines up with the column of the same name built by to_df().
    comment_id = entry$id,
    user = entry$user$login,
    state = entry$state,
    comments = entry$comments,
    created_at = lubridate::parse_date_time(
      entry$created_at, "%Y-%m-%d %H:%M:%S"
    ),
    updated_at = lubridate::parse_date_time(
      entry$updated_at, "%Y-%m-%d %H:%M:%S"
    ),
    title = entry$title,
    body = body
  )
}
# Build one table from the comment entries and one from the issue entries.
df <- dplyr::bind_rows(
  lapply(X = issues$content, FUN = to_df)
)
meta_df <- dplyr::bind_rows(
  lapply(X = issue_meta$content, FUN = meta_to_df)
)

# No `by` is given, so the join uses every column name the two tables share.
issue_tbl <- dplyr::full_join(x = df, y = meta_df)

# Save the combined RNeXML table.
readr::write_csv(issue_tbl, "../_data/rnexml.csv")

Now let’s do the same for the EML repository.

# Fetch all issue comments and all issues (any state) for ropensci/EML.
issues <- auto.page(
  github::get.all.repository.issues.comments("ropensci", "EML", ctx = ctx)
)
issue_meta <- auto.page(
  github::get.repository.issues(
    "ropensci", "EML",
    state = "all", filter = "all", ctx = ctx
  )
)

# Build one table from the comment entries and one from the issue entries.
df <- dplyr::bind_rows(
  lapply(X = issues$content, FUN = to_df)
)
meta_df <- dplyr::bind_rows(
  lapply(X = issue_meta$content, FUN = meta_to_df)
)

# No `by` is given, so the join uses every column name the two tables share.
issue_tbl <- dplyr::full_join(x = df, y = meta_df)

# Save the combined EML table.
readr::write_csv(issue_tbl, "../_data/eml.csv")