// blob: 2be8801c76b22c5db318167da9856f887bf1378e [file] [log] [blame]

package main
import (
"bufio"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"math/rand"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"time"
)
// Constants used when talking to the Google Groups batchexecute endpoint.
const (
	// DefaultBL is the value sent in the "bl" query parameter of every
	// batchexecute request.
	DefaultBL = "boq_groupsfrontendserver_20220224.07_p0"

	// IDs for API requests:

	// RPCIDListConversations is the RPC ID of
	// /GroupsFrontendConversationService.ListConversations.
	RPCIDListConversations = "Dq0xse"
	// RPCIDListConversationMessages is the RPC ID of
	// /GroupsFrontendConversationService.ListConversationMessages.
	RPCIDListConversationMessages = "H08Fi"
)
// Command-line flags.
var (
	// What to export and which of the two modes to run.
	group      = flag.String("group", "", "Email of the group you want to export.")
	getList    = flag.Bool("getList", false, "Get a list of threads and write it to the file specified in --file (one of --getList or --getThreads is required).")
	getThreads = flag.Bool("getThreads", false, "Retrieve all the threads specified in the thread list passed via STDIN (one of --getList or --getThreads is required).")

	// Where the output goes.
	fileName   = flag.String("file", "threads.txt", "File where thread IDs will be written when running with --getList.")
	folderName = flag.String("folder", "threads", "Folder where threads will be saved when running with --getThreads.")

	// Credentials, only used when --authenticated is set.
	authenticated = flag.Bool("authenticated", false, "Whether you want to take out the forum with authentication.")
	cookies       = flag.String("cookies", "", "Cookies (if you want to take out the forum authenticated).")
	fsid          = flag.String("fsid", "", "f.sid value (if you want to take out the forum authenticated).")
	at            = flag.String("at", "", "at value (if you want to take out the forum authenticated).")
)

// State shared by every request.
var (
	// realCookies holds the cookies parsed from --cookies; they are attached
	// to each request when --authenticated is set.
	realCookies []*http.Cookie

	// reqId is the value sent in the "_reqid" query parameter. It starts at
	// a random value below 999999.
	reqId = rand.Intn(999999)
)
// Request is a single RPC call to be sent as part of a batch request.
type Request struct {
	// Rpc is the RPC ID.
	Rpc string
	// Request is the request payload, encoded as a string.
	Request string
}
// Response is the server's answer to one RPC of a batch request.
type Response struct {
	// Rpc is the RPC ID.
	Rpc string
	// Data is the response payload.
	Data string
	// Index is the order index (either a number encoded as a string or
	// "generic").
	Index string
	// Ok reports whether the request finished successfully, in which case
	// Data and Index are filled in.
	Ok bool
}
// ConversationListResponse is one page of a group's thread list.
type ConversationListResponse struct {
	// PaginationToken is the token of the next page.
	PaginationToken string
	// IDs is the list of thread IDs found in this page.
	IDs []string
}
// ConversationMessagesResponse is one page of the messages of a thread.
type ConversationMessagesResponse struct {
	// PaginationToken is the token of the next page.
	PaginationToken string
	// Data is the thread data, encoded as PB+JSON.
	Data string
}
// batchRequest sends all the given RPC requests in a single HTTP POST to the
// Google Groups batchexecute endpoint and returns the responses found in the
// reply.
//
// Every request is encoded as [rpc, request, null, index], where index is the
// 1-based position of the request encoded as a string. When --authenticated
// is set, the "at" form value, the "f.sid" query parameter and the parsed
// cookies are added to the request.
//
// An error is returned when the request can't be built or sent, when the
// status code isn't 200, or when the body can't be decoded. Reply entries
// which aren't "wrb.fr" arrays with at least 7 elements are skipped; entries
// whose data or index isn't a string are returned with Ok set to false.
//
// NOTE(review): reqId is read and updated here without synchronization, while
// main starts several downloadThread goroutines which end up calling this
// function concurrently.
func batchRequest(requests *[]Request) (*[]Response, error) {
	var requestsArray [][]interface{}
	var RPCIdsSlice []string
	for i, r := range *requests {
		requestsArray = append(requestsArray, []interface{}{r.Rpc, r.Request, nil, strconv.Itoa(i + 1)})
		RPCIdsSlice = append(RPCIdsSlice, r.Rpc)
	}

	freq, err := json.Marshal(requestsArray)
	if err != nil {
		return nil, err
	}

	// The endpoint expects the list of requests wrapped in an outer array.
	v := url.Values{}
	v.Set("f.req", "["+string(freq)+"]")
	if *authenticated {
		v.Set("at", *at)
	}

	RPCIds := url.QueryEscape(strings.Join(RPCIdsSlice, ","))
	reqUrl := "https://groups.google.com/_/GroupsFrontendUi/data/batchexecute?rpcids=" + RPCIds + "&bl=" + DefaultBL + "&hl=en&_reqid=" + strconv.Itoa(reqId)
	if *authenticated {
		reqUrl += "&f.sid=" + url.QueryEscape(*fsid)
	}

	req, err := http.NewRequest("POST", reqUrl, strings.NewReader(v.Encode()))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	if *authenticated {
		for _, c := range realCookies {
			req.AddCookie(c)
		}
	}

	c := &http.Client{}
	resp, err := c.Do(req)
	reqId += 100000
	if err != nil {
		return nil, err
	}
	// The body must always be closed (also on the error paths below),
	// otherwise the underlying connection is leaked.
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("Status code is %v", resp.StatusCode)
	}

	// Discard the first 6 bytes, which precede the JSON payload (presumably
	// an anti-hijacking prefix -- TODO confirm).
	if _, err := io.CopyN(ioutil.Discard, resp.Body, 6); err != nil {
		return nil, fmt.Errorf("skipping the response prefix: %w", err)
	}

	var respBody [][]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&respBody); err != nil {
		return nil, err
	}

	var responses []Response
	for _, r := range respBody {
		if len(r) < 7 || r[0] != "wrb.fr" {
			continue
		}

		rpc, ok := r[1].(string)
		if !ok {
			return nil, fmt.Errorf("Couldn't parse the response (expected a string with the rpc ID).")
		}

		// A response is only considered successful when both the data and
		// the order index are strings.
		response := Response{Rpc: rpc}
		data, okData := r[2].(string)
		index, okIndex := r[6].(string)
		if okData && okIndex {
			response.Data = data
			response.Index = index
			response.Ok = true
		}
		responses = append(responses, response)
	}
	return &responses, nil
}
// getConversations requests one page of the thread list of a group.
//
// The request sent is [group, num, paginationToken]; pass an empty
// paginationToken to request the first page. The returned value holds the
// thread IDs found in the page and, when the server sent one, the token of
// the next page.
//
// An error is returned when the batch request fails (the underlying error is
// wrapped), when the server didn't fulfill the RPC, when the response doesn't
// have the expected shape, or when no response for the RPC is found.
func getConversations(group string, paginationToken string, num int) (*ConversationListResponse, error) {
	reqText, err := json.Marshal([]interface{}{group, num, paginationToken})
	if err != nil {
		return nil, err
	}

	requests := []Request{
		{
			Rpc:     RPCIDListConversations,
			Request: string(reqText),
		},
	}
	resp, err := batchRequest(&requests)
	if err != nil {
		return nil, fmt.Errorf("An error occurred while requesting the conversation list: %w", err)
	}

	for _, r := range *resp {
		if r.Rpc != RPCIDListConversations {
			continue
		}
		if !r.Ok {
			return nil, fmt.Errorf("The server didn't fulfill the request successfully (maybe you don't have permission to view the group?)")
		}
		return parseConversationList(r.Data)
	}

	return nil, fmt.Errorf("The server didn't return the conversations list correctly, or we couldn't find it.")
}

// parseConversationList decodes the payload of a ListConversations response:
// the thread IDs are read from body[2][i][0][1] and the optional pagination
// token from body[3].
func parseConversationList(data string) (*ConversationListResponse, error) {
	var body []interface{}
	if err := json.Unmarshal([]byte(data), &body); err != nil {
		return nil, fmt.Errorf("While parsing conversation list response: %v", err)
	}
	if len(body) < 3 {
		return nil, fmt.Errorf("While parsing conversation list response: body isn't long enough")
	}

	threads, ok := body[2].([]interface{})
	if !ok {
		return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2] should be an array).")
	}

	var list ConversationListResponse
	for _, t := range threads {
		thread, ok := t.([]interface{})
		if !ok {
			return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i] should be an array).")
		}
		if len(thread) < 1 {
			return nil, fmt.Errorf("While parsing conversation list response: thread isn't long enough")
		}

		info, ok := thread[0].([]interface{})
		if !ok {
			return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i][0] should be an array).")
		}
		if len(info) < 2 {
			return nil, fmt.Errorf("While parsing conversation list response: thread info isn't long enough")
		}

		threadID, ok := info[1].(string)
		if !ok {
			return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i][0][1] should be a string).")
		}
		list.IDs = append(list.IDs, threadID)
	}

	// The pagination token is optional: it is only kept when it is present
	// and is a string.
	if len(body) >= 4 {
		if token, ok := body[3].(string); ok {
			list.PaginationToken = token
		}
	}
	return &list, nil
}
// getAllConversations retrieves the IDs of all the threads of a group by
// requesting pages of 50 threads via getConversations until a page comes
// without a pagination token. Progress is logged after every page, and the
// first error returned by getConversations aborts the retrieval.
func getAllConversations(group string) (*[]string, error) {
	var (
		allIDs []string
		token  string
	)
	for {
		page, err := getConversations(group, token, 50)
		if err != nil {
			return nil, err
		}

		allIDs = append(allIDs, page.IDs...)
		log.Printf("Retrieved %v posts (total: %v)...\n", len(page.IDs), len(allIDs))

		token = page.PaginationToken
		if token == "" {
			// No token means this was the last page.
			return &allIDs, nil
		}

		// Sleep for a second to prevent overwhelming the server
		time.Sleep(time.Second)
	}
}
// getConversation requests one page of the messages of thread id in a group.
//
// For the first page (empty paginationToken) the request sent is
// [group, id, num, null, null, 2]; for the following pages it is
// [group, id, null, paginationToken], so num is only sent with the first
// page.
//
// The returned value holds the raw response data and, when the server sent
// one, the token of the next page. An error is returned when the batch
// request fails (the underlying error is wrapped), when the server didn't
// fulfill the RPC, when the data is empty or can't be parsed, or when no
// response for the RPC is found.
func getConversation(group string, id string, paginationToken string, num int) (*ConversationMessagesResponse, error) {
	var request []interface{}
	if paginationToken == "" {
		request = []interface{}{group, id, num, nil, nil, 2}
	} else {
		request = []interface{}{group, id, nil, paginationToken}
	}

	reqText, err := json.Marshal(request)
	if err != nil {
		return nil, err
	}

	requests := []Request{
		{
			Rpc:     RPCIDListConversationMessages,
			Request: string(reqText),
		},
	}
	resp, err := batchRequest(&requests)
	if err != nil {
		return nil, fmt.Errorf("An error occurred while requesting the conversation messages: %w", err)
	}

	for _, r := range *resp {
		if r.Rpc != RPCIDListConversationMessages {
			continue
		}
		if !r.Ok {
			return nil, fmt.Errorf("The server didn't fulfill the request successfully (maybe you don't have permission to view the group?)")
		}
		if r.Data == "" || r.Data == "[]" {
			return nil, fmt.Errorf("No data was returned for the thread.")
		}

		// The data is returned untouched; it is only parsed to look for the
		// optional pagination token in body[3].
		var body []interface{}
		if err := json.Unmarshal([]byte(r.Data), &body); err != nil {
			return nil, fmt.Errorf("While parsing conversation messages response: %v", err)
		}

		messages := ConversationMessagesResponse{Data: r.Data}
		if len(body) >= 4 {
			if token, ok := body[3].(string); ok {
				messages.PaginationToken = token
			}
		}
		return &messages, nil
	}

	return nil, fmt.Errorf("The server didn't return the conversation messages correctly, or we couldn't find them.")
}
// downloadThread downloads all the pages of thread id and saves them inside
// the folder passed via --folder: the first page is written to "<id>.json"
// and the following ones to "<id>_<n>.json" (n = 1, 2, ...).
//
// Exactly one value is sent before returning: id is sent to chFinishedIDs
// when every page was retrieved and written, or to chFailedIDs as soon as
// retrieving or writing a page fails.
func downloadThread(group string, id string, chFailedIDs chan string, chFinishedIDs chan string) {
	paginationToken := ""
	for i := 0; ; i++ {
		resp, err := getConversation(group, id, paginationToken, 100)
		if err != nil {
			log.Printf("Error downloading thread %v: %v", id, err)
			chFailedIDs <- id
			return
		}

		name := id + ".json"
		if i > 0 {
			name = id + "_" + strconv.Itoa(i) + ".json"
		}
		fullName := *folderName + "/" + name

		if err := os.WriteFile(fullName, []byte(resp.Data), 0644); err != nil {
			// A page which couldn't be saved means the thread wasn't
			// exported, so it must not be reported as finished.
			log.Printf("Error downloading thread %v: couldn't write file \"%v\": %v", id, fullName, err)
			chFailedIDs <- id
			return
		}

		if resp.PaginationToken == "" {
			break
		}
		paginationToken = resp.PaginationToken
	}
	chFinishedIDs <- id
}
// queueDownloadIfNeeded starts the download of the next pending thread, if
// there is one left.
//
// *nextIndex is the index inside *IDs of the next thread to download. When it
// is below len(*IDs), the function sleeps for 50 milliseconds, launches
// downloadThread for that thread in a new goroutine and increments
// *nextIndex. Otherwise it does nothing.
func queueDownloadIfNeeded(group string, chFailedIDs chan string, chFinishedIDs chan string, IDs *[]string, nextIndex *int) {
	pending := *IDs
	idx := *nextIndex
	if idx >= len(pending) {
		return
	}

	time.Sleep(50 * time.Millisecond)
	go downloadThread(group, pending[idx], chFailedIDs, chFinishedIDs)
	*nextIndex = idx + 1
}
// main parses the flags and runs one of the two modes:
//
//   - --getList retrieves the IDs of all the threads of the group and writes
//     them to the file passed via --file, one per line.
//   - --getThreads reads thread IDs from STDIN (one per line, blank lines are
//     ignored) and downloads them into the folder passed via --folder,
//     keeping up to 12 downloads running at the same time.
//
// When --authenticated is set, the cookies passed via --cookies are parsed
// and stored in realCookies before any request is made.
func main() {
	flag.Parse()

	if *group == "" {
		log.Fatalln("A Google Group wasn't provided via the \"--group\" flag.")
	}
	// Exactly one of the two modes must be selected.
	if *getList == *getThreads {
		log.Fatalln("Please specify one of --getList or --getThreads (but not both).")
	}

	if *authenticated {
		if *cookies == "" || *fsid == "" || *at == "" {
			log.Fatalln("If you specify --authenticated, you should also specify --cookies, --fsid and --at.")
		}

		// Let net/http parse the cookie string by wrapping it in a fake
		// request.
		rawRequest := fmt.Sprintf("GET / HTTP/1.0\nCookie: %s\n\n", *cookies)
		req, err := http.ReadRequest(bufio.NewReader(strings.NewReader(rawRequest)))
		if err != nil {
			// Going on would silently send unauthenticated requests.
			log.Fatalf("Couldn't parse the cookies passed via --cookies: %v", err)
		}
		realCookies = req.Cookies()
	}

	if *getList {
		log.Printf("Getting list of thread IDs for group %s...\n", *group)

		file, err := os.Create(*fileName)
		if err != nil {
			log.Fatalf("Couldn't create file \"%v\": %v", *fileName, err)
		}

		// Get a list of conversation IDs
		convs, err := getAllConversations(*group)
		if err != nil {
			log.Fatalf("Error calling getAllConversations: %v\n", err)
		}

		// Save those to the file, one per line
		for _, id := range *convs {
			if _, err := io.WriteString(file, id+"\n"); err != nil {
				log.Fatalf("Couldn't write to file \"%v\": %v", *fileName, err)
			}
		}
		if err := file.Close(); err != nil {
			log.Fatalf("Couldn't close file \"%v\": %v", *fileName, err)
		}
	}

	if *getThreads {
		log.Printf("Starting actual takeout for group %s...\n", *group)

		scanner := bufio.NewScanner(os.Stdin)
		var IDs []string
		for scanner.Scan() {
			// Blank lines (and stray whitespace such as "\r") aren't IDs.
			id := strings.TrimSpace(scanner.Text())
			if id == "" {
				continue
			}
			IDs = append(IDs, id)
		}
		if err := scanner.Err(); err != nil {
			log.Fatalf("Couldn't read the thread list from STDIN: %v", err)
		}

		// downloadThread writes files inside this folder, so make sure it
		// exists before starting.
		if err := os.MkdirAll(*folderName, 0755); err != nil {
			log.Fatalf("Couldn't create folder \"%v\": %v", *folderName, err)
		}

		log.Printf("Total: %v threads. Beginning to download them...\n", len(IDs))

		chFailedIDs := make(chan string)
		chFinishedIDs := make(chan string)

		// Start the first batch of downloads. nextIndex always is the index
		// of the next thread which hasn't been started yet, which is what
		// queueDownloadIfNeeded expects; this way every thread is started
		// exactly once and the loop below waits for all of them.
		const initialDownloads = 12
		nextIndex := 0
		for nextIndex < len(IDs) && nextIndex < initialDownloads {
			go downloadThread(*group, IDs[nextIndex], chFailedIDs, chFinishedIDs)
			nextIndex++
		}

		// Every download reports exactly once, either as failed or as
		// finished; each report frees a slot for the next pending thread.
		failedThreads := make([]string, 0)
		for i := 0; i < len(IDs); i++ {
			select {
			case id := <-chFailedIDs:
				failedThreads = append(failedThreads, id)
			case id := <-chFinishedIDs:
				log.Printf("Finished downloading thread %v successfully\n", id)
			}
			queueDownloadIfNeeded(*group, chFailedIDs, chFinishedIDs, &IDs, &nextIndex)
		}

		log.Printf("Failed threads: %v", failedThreads)
	}
}