Adrià Vilanova Martínez | 81a703d | 2022-03-04 01:03:01 +0100 | [diff] [blame] | 1 | package main |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "encoding/json" |
| 6 | "flag" |
| 7 | "fmt" |
| 8 | "io" |
| 9 | "io/ioutil" |
| 10 | "log" |
| 11 | "math/rand" |
| 12 | "net/http" |
| 13 | "net/url" |
| 14 | "os" |
| 15 | "strconv" |
| 16 | "strings" |
| 17 | "time" |
| 18 | ) |
| 19 | |
const (
	// DefaultBL is the value sent as the "bl" query parameter of every
	// batchexecute request.
	DefaultBL = "boq_groupsfrontendserver_20220224.07_p0"

	// IDs for API requests.

	// RPCIDListConversations is the RPC ID of
	// /GroupsFrontendConversationService.ListConversations.
	RPCIDListConversations = "Dq0xse"
	// RPCIDListConversationMessages is the RPC ID of
	// /GroupsFrontendConversationService.ListConversationMessages.
	RPCIDListConversationMessages = "H08Fi"
)
| 25 | |
// Command-line flags.
var (
	group         = flag.String("group", "", "Email of the group you want to export.")
	getList       = flag.Bool("getList", false, "Get a list of threads and write it to the file specified in --file (one of --getList or --getThreads is required).")
	getThreads    = flag.Bool("getThreads", false, "Retrieve all the threads specified in the thread list passed via STDIN (one of --getList or --getThreads is required).")
	fileName      = flag.String("file", "threads.txt", "File where thread IDs will be written when running with --getList.")
	folderName    = flag.String("folder", "threads", "Folder where threads will be saved when running with --getThreads.")
	authenticated = flag.Bool("authenticated", false, "Whether you want to take out the forum with authentication.")
	cookies       = flag.String("cookies", "", "Cookies (if you want to take out the forum authenticated).")
	fsid          = flag.String("fsid", "", "f.sid value (if you want to take out the forum authenticated).")
	at            = flag.String("at", "", "at value (if you want to take out the forum authenticated).")
)

// Program-wide state shared by the request helpers.
var (
	// realCookies holds the cookies that are attached to every request when
	// running authenticated.
	realCookies []*http.Cookie

	// reqId is the value sent as the "_reqid" query parameter. It starts at
	// a random value below 999999.
	reqId = rand.Intn(999999)
)
| 40 | |
// Request is a single RPC call to be sent inside a batch request.
type Request struct {
	// Rpc is the RPC ID.
	Rpc string
	// Request is the request payload, encoded as a string.
	Request string
}
| 45 | |
// Response is the outcome of a single RPC call contained in a batch response.
type Response struct {
	// Rpc is the RPC ID.
	Rpc string
	// Data is the response payload.
	Data string
	// Index is the order index (can be a number encoded as a string or
	// "generic").
	Index string
	// Ok reports whether the request finished successfully and the data is
	// thus filled.
	Ok bool
}
| 52 | |
// ConversationListResponse is one page of a group's thread list.
type ConversationListResponse struct {
	// PaginationToken is the token of the next page.
	PaginationToken string
	// IDs is the list with thread IDs.
	IDs []string
}
| 57 | |
// ConversationMessagesResponse is one page of the messages of a thread.
type ConversationMessagesResponse struct {
	// PaginationToken is the token of the next page.
	PaginationToken string
	// Data is the thread data encoded as PB+JSON.
	Data string
}
| 62 | |
| 63 | func batchRequest(requests *[]Request) (*[]Response, error) { |
| 64 | var requestsArray [][]interface{} |
| 65 | var RPCIdsSlice []string |
| 66 | for i, r := range *requests { |
| 67 | requestArray := make([]interface{}, 4) |
| 68 | requestArray[0] = r.Rpc |
| 69 | requestArray[1] = r.Request |
| 70 | requestArray[2] = nil |
| 71 | requestArray[3] = strconv.Itoa(i + 1) |
| 72 | requestsArray = append(requestsArray, requestArray) |
| 73 | RPCIdsSlice = append(RPCIdsSlice, r.Rpc) |
| 74 | } |
| 75 | freq, err := json.Marshal(requestsArray) |
| 76 | if err != nil { |
| 77 | return nil, err |
| 78 | } |
| 79 | freqString := "[" + string(freq) + "]" |
| 80 | |
| 81 | v := url.Values{} |
| 82 | v.Set("f.req", freqString) |
| 83 | if *authenticated { |
| 84 | v.Set("at", *at) |
| 85 | } |
| 86 | RPCIds := url.QueryEscape(strings.Join(RPCIdsSlice, ",")) |
| 87 | reqUrl := "https://groups.google.com/_/GroupsFrontendUi/data/batchexecute?rpcids=" + RPCIds + "&bl=" + DefaultBL + "&hl=en&_reqid=" + strconv.Itoa(reqId) |
| 88 | if *authenticated { |
| 89 | reqUrl += "&f.sid=" + url.QueryEscape(*fsid) |
| 90 | } |
| 91 | |
| 92 | req, err := http.NewRequest("POST", reqUrl, strings.NewReader(v.Encode())) |
| 93 | if err != nil { |
| 94 | return nil, err |
| 95 | } |
| 96 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded") |
| 97 | if *authenticated { |
| 98 | for _, c := range realCookies { |
| 99 | req.AddCookie(c) |
| 100 | } |
| 101 | } |
| 102 | c := &http.Client{} |
| 103 | resp, err := c.Do(req) |
| 104 | reqId += 100000 |
| 105 | if err != nil { |
| 106 | return nil, err |
| 107 | } |
| 108 | |
| 109 | if resp.StatusCode != 200 { |
| 110 | return nil, fmt.Errorf("Status code is %v", resp.StatusCode) |
| 111 | } |
| 112 | |
| 113 | var respBody [][]interface{} |
| 114 | io.CopyN(ioutil.Discard, resp.Body, 6) // Discard first 6 bytes |
| 115 | err = json.NewDecoder(resp.Body).Decode(&respBody) |
| 116 | if err != nil { |
| 117 | return nil, err |
| 118 | } |
| 119 | |
| 120 | var responses []Response |
| 121 | for _, r := range respBody { |
| 122 | if len(r) < 7 || r[0] != "wrb.fr" { |
| 123 | continue |
| 124 | } |
| 125 | |
| 126 | rpc, ok1 := r[1].(string) |
| 127 | data, ok2 := r[2].(string) |
| 128 | index, ok3 := r[6].(string) |
| 129 | if !ok1 { |
| 130 | return nil, fmt.Errorf("Couldn't parse the response (expected a string with the rpc ID).") |
| 131 | } |
| 132 | |
| 133 | var response Response |
| 134 | if !ok2 || !ok3 { |
| 135 | response = Response{ |
| 136 | Rpc: rpc, |
| 137 | Ok: false, |
| 138 | } |
| 139 | } else { |
| 140 | response = Response{ |
| 141 | Rpc: rpc, |
| 142 | Data: data, |
| 143 | Index: index, |
| 144 | Ok: true, |
| 145 | } |
| 146 | } |
| 147 | responses = append(responses, response) |
| 148 | } |
| 149 | |
| 150 | return &responses, nil |
| 151 | } |
| 152 | |
| 153 | func getConversations(group string, paginationToken string, num int) (*ConversationListResponse, error) { |
| 154 | request := make([]interface{}, 3) |
| 155 | request[0] = group |
| 156 | request[1] = num |
| 157 | request[2] = paginationToken |
| 158 | reqText, err := json.Marshal(request) |
| 159 | if err != nil { |
| 160 | return nil, err |
| 161 | } |
| 162 | |
| 163 | requests := []Request{ |
| 164 | Request{ |
| 165 | Rpc: RPCIDListConversations, |
| 166 | Request: string(reqText), |
| 167 | }, |
| 168 | } |
| 169 | |
| 170 | resp, err := batchRequest(&requests) |
| 171 | if err != nil { |
| 172 | return nil, fmt.Errorf("An error occurred while requesting the conversation list: %v\n", err) |
| 173 | } |
| 174 | |
| 175 | for _, r := range *resp { |
| 176 | if r.Rpc == RPCIDListConversations { |
| 177 | if !r.Ok { |
| 178 | return nil, fmt.Errorf("The server didn't fulfill the request successfully (maybe you don't have permission to view the group?)") |
| 179 | } |
| 180 | |
| 181 | var body []interface{} |
| 182 | err = json.Unmarshal([]byte(r.Data), &body) |
| 183 | if err != nil { |
| 184 | return nil, fmt.Errorf("While parsing conversation list response: %v", err) |
| 185 | } |
| 186 | if len(body) < 3 { |
| 187 | return nil, fmt.Errorf("While parsing conversation list response: body isn't long enough") |
| 188 | } |
| 189 | |
| 190 | var resp ConversationListResponse |
| 191 | |
| 192 | // Retrieve thread IDs |
| 193 | var IDs []string |
| 194 | threads, ok := body[2].([]interface{}) |
| 195 | if !ok { |
| 196 | return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2] should be an array).") |
| 197 | } |
| 198 | for _, t := range threads { |
| 199 | ta, ok := t.([]interface{}) |
| 200 | if !ok { |
| 201 | return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i] should be an array).") |
| 202 | } |
| 203 | if len(ta) < 1 { |
| 204 | return nil, fmt.Errorf("While parsing conversation list response: thread isn't long enough") |
| 205 | } |
| 206 | info, ok := ta[0].([]interface{}) |
| 207 | if !ok { |
| 208 | return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i] should be an array).") |
| 209 | } |
| 210 | if len(info) < 2 { |
| 211 | return nil, fmt.Errorf("While parsing conversation list response: thread info isn't long enough") |
| 212 | } |
| 213 | threadId, ok := info[1].(string) |
| 214 | if !ok { |
| 215 | return nil, fmt.Errorf("The conversation list response doesn't comply with the protobuf model we have seen (body[2][i][0][1] should be a string).") |
| 216 | } |
| 217 | IDs = append(IDs, threadId) |
| 218 | } |
| 219 | resp.IDs = IDs |
| 220 | |
| 221 | // Retrieve pagination token |
| 222 | if len(body) >= 4 { |
| 223 | paginationToken, ok := body[3].(string) |
| 224 | if ok { |
| 225 | resp.PaginationToken = paginationToken |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | return &resp, nil |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | return nil, fmt.Errorf("The server didn't return the conversations list correctly, or we couldn't find it.") |
| 234 | } |
| 235 | |
| 236 | func getAllConversations(group string) (*[]string, error) { |
| 237 | paginationToken := "" |
| 238 | totalRetrieved := 0 |
| 239 | var IDs []string |
| 240 | for { |
| 241 | resp, err := getConversations(group, paginationToken, 50) |
| 242 | if err != nil { |
| 243 | return nil, err |
| 244 | } |
| 245 | totalRetrieved += len(resp.IDs) |
| 246 | log.Printf("Retrieved %v posts (total: %v)...\n", len(resp.IDs), totalRetrieved) |
| 247 | |
| 248 | IDs = append(IDs, resp.IDs...) |
| 249 | |
| 250 | if resp.PaginationToken == "" { |
| 251 | break |
| 252 | } |
| 253 | paginationToken = resp.PaginationToken |
| 254 | time.Sleep(time.Second) // Sleep for a second to prevent overwhelming the server |
| 255 | } |
| 256 | return &IDs, nil |
| 257 | } |
| 258 | |
| 259 | func getConversation(group string, id string, paginationToken string, num int) (*ConversationMessagesResponse, error) { |
| 260 | request := make([]interface{}, 4) |
| 261 | request[0] = group |
| 262 | request[1] = id |
| 263 | if paginationToken == "" { |
| 264 | request[2] = num |
| 265 | request[3] = nil |
| 266 | request = append(request, nil, 2) |
| 267 | } else { |
| 268 | request[2] = nil |
| 269 | request[3] = paginationToken |
| 270 | } |
| 271 | reqText, err := json.Marshal(request) |
| 272 | if err != nil { |
| 273 | return nil, err |
| 274 | } |
| 275 | |
| 276 | requests := []Request{ |
| 277 | Request{ |
| 278 | Rpc: RPCIDListConversationMessages, |
| 279 | Request: string(reqText), |
| 280 | }, |
| 281 | } |
| 282 | |
| 283 | resp, err := batchRequest(&requests) |
| 284 | if err != nil { |
| 285 | return nil, fmt.Errorf("An error occurred while requesting the conversation messages: %v\n", err) |
| 286 | } |
| 287 | |
| 288 | for _, r := range *resp { |
| 289 | if r.Rpc == RPCIDListConversationMessages { |
| 290 | if !r.Ok { |
| 291 | return nil, fmt.Errorf("The server didn't fulfill the request successfully (maybe you don't have permission to view the group?)") |
| 292 | } |
| 293 | |
| 294 | if r.Data == "" || r.Data == "[]" { |
| 295 | return nil, fmt.Errorf("No data was returned for the thread.") |
| 296 | } |
| 297 | |
| 298 | var resp ConversationMessagesResponse |
| 299 | resp.Data = r.Data |
| 300 | |
| 301 | // Get pagination token |
| 302 | var body []interface{} |
| 303 | err = json.Unmarshal([]byte(r.Data), &body) |
| 304 | if err != nil { |
| 305 | return nil, fmt.Errorf("While parsing conversation list response: %v", err) |
| 306 | } |
| 307 | |
| 308 | if len(body) >= 4 { |
| 309 | paginationToken, ok := body[3].(string) |
| 310 | if ok { |
| 311 | resp.PaginationToken = paginationToken |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | return &resp, nil |
| 316 | } |
| 317 | } |
| 318 | |
| 319 | return nil, fmt.Errorf("The server didn't return the conversations list correctly, or we couldn't find it.") |
| 320 | } |
| 321 | |
| 322 | func downloadThread(group string, id string, chFailedIDs chan string, chFinishedIDs chan string) { |
| 323 | i := 0 |
| 324 | paginationToken := "" |
| 325 | for { |
| 326 | resp, err := getConversation(group, id, paginationToken, 100) |
| 327 | if err != nil { |
| 328 | log.Printf("Error downloading thread %v: %v", id, err) |
| 329 | chFailedIDs <- id |
| 330 | return |
| 331 | } |
| 332 | |
| 333 | name := id + ".json" |
| 334 | if i > 0 { |
| 335 | name = id + "_" + strconv.Itoa(i) + ".json" |
| 336 | } |
| 337 | fullName := *folderName + "/" + name |
| 338 | |
| 339 | err = os.WriteFile(fullName, []byte(resp.Data), 0644) |
| 340 | if err != nil { |
| 341 | log.Printf("Error downloading thread %v: couldn't write file \"%v\": %v", id, fullName, err) |
| 342 | } |
| 343 | |
| 344 | if resp.PaginationToken == "" { |
| 345 | break |
| 346 | } |
| 347 | paginationToken = resp.PaginationToken |
| 348 | i++ |
| 349 | } |
| 350 | |
| 351 | chFinishedIDs <- id |
| 352 | } |
| 353 | |
| 354 | func queueDownloadIfNeeded(group string, chFailedIDs chan string, chFinishedIDs chan string, IDs *[]string, nextIndex *int) { |
| 355 | if *nextIndex < len(*IDs) { |
| 356 | time.Sleep(50 * time.Millisecond) |
| 357 | go downloadThread(group, (*IDs)[*nextIndex], chFailedIDs, chFinishedIDs) |
| 358 | *nextIndex++ |
| 359 | } |
| 360 | } |
| 361 | |
| 362 | func main() { |
| 363 | flag.Parse() |
| 364 | if *group == "" { |
| 365 | log.Fatalln("A Google Group wasn't provided via the \"--group\" flag.") |
| 366 | } |
| 367 | |
| 368 | if (*getList && *getThreads) || (!*getList && !*getThreads) { |
| 369 | log.Fatalln("Please specify one of --getList or --getThreads (but not both).") |
| 370 | } |
| 371 | |
| 372 | if *authenticated { |
| 373 | if *cookies == "" || *fsid == "" || *at == "" { |
| 374 | log.Fatalln("If you specify --authenticated, you should also specify --cookies, --fsid and --at.") |
| 375 | } |
| 376 | |
| 377 | rawRequest := fmt.Sprintf("GET / HTTP/1.0\nCookie: %s\n\n", *cookies) |
| 378 | req, err := http.ReadRequest(bufio.NewReader(strings.NewReader(rawRequest))) |
| 379 | if err == nil { |
| 380 | realCookies = req.Cookies() |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | if *getList { |
| 385 | log.Printf("Getting list of thread IDs for group %s...\n", *group) |
| 386 | |
| 387 | file, err := os.Create(*fileName) |
| 388 | if err != nil { |
| 389 | log.Fatalf("Couldn't create file \"%v\"", *fileName) |
| 390 | } |
| 391 | |
| 392 | // Get a list of conversation IDs |
| 393 | convs, err := getAllConversations(*group) |
| 394 | if err != nil { |
| 395 | log.Fatalf("Error calling getAllConversations: %v\n", err) |
| 396 | } |
| 397 | |
| 398 | // Save those to the file, one by line |
| 399 | for _, id := range *convs { |
| 400 | io.WriteString(file, id+"\n") |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | if *getThreads { |
| 405 | log.Printf("Starting actual takeout for group %s...\n", *group) |
| 406 | scanner := bufio.NewScanner(os.Stdin) |
| 407 | var IDs []string |
| 408 | for scanner.Scan() { |
| 409 | id := scanner.Text() |
| 410 | IDs = append(IDs, id) |
| 411 | } |
| 412 | log.Printf("Total: %v threads. Beginning to download them...\n", len(IDs)) |
| 413 | |
| 414 | chFailedIDs := make(chan string) |
| 415 | chFinishedIDs := make(chan string) |
| 416 | |
| 417 | nextIndex := -1 |
| 418 | for i, id := range IDs { |
| 419 | go downloadThread(*group, id, chFailedIDs, chFinishedIDs) |
| 420 | nextIndex = i |
| 421 | if i > 10 { |
| 422 | break |
| 423 | } |
| 424 | } |
| 425 | |
| 426 | failedThreads := make([]string, 0) |
| 427 | for i := 0; i < len(IDs); i++ { |
| 428 | select { |
| 429 | case id := <-chFailedIDs: |
| 430 | failedThreads = append(failedThreads, id) |
| 431 | queueDownloadIfNeeded(*group, chFailedIDs, chFinishedIDs, &IDs, &nextIndex) |
| 432 | case id := <-chFinishedIDs: |
| 433 | log.Printf("Finished downloading thread %v successfully\n", id) |
| 434 | queueDownloadIfNeeded(*group, chFailedIDs, chFinishedIDs, &IDs, &nextIndex) |
| 435 | } |
| 436 | } |
| 437 | |
| 438 | log.Printf("Failed threads: %v", failedThreads) |
| 439 | } |
| 440 | } |