blob: 6a60192b465e7e5c8f466b5984fd75e516f91ebd [file] [log] [blame]
Dusan Kasan17e497e2017-04-10 22:44:22 +02001package parsemail
2
3import (
Dusan Kasan17e497e2017-04-10 22:44:22 +02004 "bytes"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02005 "encoding/base64"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02006 "fmt"
7 "io"
8 "io/ioutil"
9 "mime"
10 "mime/multipart"
11 "net/mail"
12 "strings"
13 "time"
Dusan Kasan17e497e2017-04-10 22:44:22 +020014)
15
// MIME media types the parser dispatches on when walking a message.
const (
	contentTypeMultipartMixed       = "multipart/mixed"
	contentTypeMultipartAlternative = "multipart/alternative"
	contentTypeMultipartRelated     = "multipart/related"
	contentTypeTextHtml             = "text/html"
	contentTypeTextPlain            = "text/plain"
)
Dusan Kasan17e497e2017-04-10 22:44:22 +020021
Dusan Kasan45ca2642017-04-18 10:39:35 +020022// Parse an email message read from io.Reader into parsemail.Email struct
Dusan Kasanb49ceb62017-04-13 00:00:36 +020023func Parse(r io.Reader) (email Email, err error) {
Dusan Kasan4595dfe2017-04-13 00:38:24 +020024 msg, err := mail.ReadMessage(r)
Dusan Kasan17e497e2017-04-10 22:44:22 +020025 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020026 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020027 }
28
Dusan Kasanb49ceb62017-04-13 00:00:36 +020029 email, err = createEmailFromHeader(msg.Header)
Dusan Kasan17e497e2017-04-10 22:44:22 +020030 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020031 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020032 }
33
Dusan Kasan428369f2020-02-24 00:47:31 +010034 email.ContentType = msg.Header.Get("Content-Type")
35 contentType, params, err := parseContentType(email.ContentType)
Dusan Kasan17e497e2017-04-10 22:44:22 +020036 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020037 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020038 }
39
Dusan Kasanb49ceb62017-04-13 00:00:36 +020040 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +020041 case contentTypeMultipartMixed:
Dusan Kasan17e497e2017-04-10 22:44:22 +020042 email.TextBody, email.HTMLBody, email.Attachments, email.EmbeddedFiles, err = parseMultipartMixed(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020043 case contentTypeMultipartAlternative:
Dusan Kasan17e497e2017-04-10 22:44:22 +020044 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartAlternative(msg.Body, params["boundary"])
Obi Symons89230f42020-04-04 14:32:11 +110045 case contentTypeMultipartRelated:
46 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartRelated(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020047 case contentTypeTextPlain:
Dusan Kasan17e497e2017-04-10 22:44:22 +020048 message, _ := ioutil.ReadAll(msg.Body)
49 email.TextBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +020050 case contentTypeTextHtml:
Dusan Kasan17e497e2017-04-10 22:44:22 +020051 message, _ := ioutil.ReadAll(msg.Body)
52 email.HTMLBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +020053 default:
Dusan Kasan428369f2020-02-24 00:47:31 +010054 email.Content, err = decodeContent(msg.Body, msg.Header.Get("Content-Transfer-Encoding"))
Dusan Kasan17e497e2017-04-10 22:44:22 +020055 }
56
Dusan Kasanb49ceb62017-04-13 00:00:36 +020057 return
58}
59
60func createEmailFromHeader(header mail.Header) (email Email, err error) {
Dusan Kasane668cf22017-04-18 12:56:51 +020061 hp := headerParser{header: &header}
62
Dusan Kasanf4376a62017-05-23 21:03:55 +020063 email.Subject = decodeMimeSentence(header.Get("Subject"))
Dusan Kasane668cf22017-04-18 12:56:51 +020064 email.From = hp.parseAddressList(header.Get("From"))
65 email.Sender = hp.parseAddress(header.Get("Sender"))
66 email.ReplyTo = hp.parseAddressList(header.Get("Reply-To"))
67 email.To = hp.parseAddressList(header.Get("To"))
68 email.Cc = hp.parseAddressList(header.Get("Cc"))
69 email.Bcc = hp.parseAddressList(header.Get("Bcc"))
70 email.Date = hp.parseTime(header.Get("Date"))
71 email.ResentFrom = hp.parseAddressList(header.Get("Resent-From"))
72 email.ResentSender = hp.parseAddress(header.Get("Resent-Sender"))
73 email.ResentTo = hp.parseAddressList(header.Get("Resent-To"))
74 email.ResentCc = hp.parseAddressList(header.Get("Resent-Cc"))
75 email.ResentBcc = hp.parseAddressList(header.Get("Resent-Bcc"))
76 email.ResentMessageID = hp.parseMessageId(header.Get("Resent-Message-ID"))
77 email.MessageID = hp.parseMessageId(header.Get("Message-ID"))
78 email.InReplyTo = hp.parseMessageIdList(header.Get("In-Reply-To"))
79 email.References = hp.parseMessageIdList(header.Get("References"))
80 email.ResentDate = hp.parseTime(header.Get("Resent-Date"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +020081
Dusan Kasane668cf22017-04-18 12:56:51 +020082 if hp.err != nil {
83 err = hp.err
Dusan Kasanb49ceb62017-04-13 00:00:36 +020084 return
85 }
86
Dusan Kasanb49ceb62017-04-13 00:00:36 +020087 //decode whole header for easier access to extra fields
88 //todo: should we decode? aren't only standard fields mime encoded?
89 email.Header, err = decodeHeaderMime(header)
90 if err != nil {
91 return
92 }
93
94 return
95}
96
97func parseContentType(contentTypeHeader string) (contentType string, params map[string]string, err error) {
98 if contentTypeHeader == "" {
Dusan Kasan45ca2642017-04-18 10:39:35 +020099 contentType = contentTypeTextPlain
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200100 return
101 }
102
103 return mime.ParseMediaType(contentTypeHeader)
104}
105
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400106func parseMultipartRelated(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
107 pmr := multipart.NewReader(msg, boundary)
108 for {
109 part, err := pmr.NextPart()
110
111 if err == io.EOF {
112 break
113 } else if err != nil {
114 return textBody, htmlBody, embeddedFiles, err
115 }
116
117 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
118 if err != nil {
119 return textBody, htmlBody, embeddedFiles, err
120 }
121
122 switch contentType {
123 case contentTypeTextPlain:
124 ppContent, err := ioutil.ReadAll(part)
125 if err != nil {
126 return textBody, htmlBody, embeddedFiles, err
127 }
128
129 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
130 case contentTypeTextHtml:
131 ppContent, err := ioutil.ReadAll(part)
132 if err != nil {
133 return textBody, htmlBody, embeddedFiles, err
134 }
135
136 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
137 case contentTypeMultipartAlternative:
138 tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
139 if err != nil {
140 return textBody, htmlBody, embeddedFiles, err
141 }
142
143 htmlBody += hb
144 textBody += tb
145 embeddedFiles = append(embeddedFiles, ef...)
146 default:
147 if isEmbeddedFile(part) {
148 ef, err := decodeEmbeddedFile(part)
149 if err != nil {
150 return textBody, htmlBody, embeddedFiles, err
151 }
152
153 embeddedFiles = append(embeddedFiles, ef)
154 } else {
155 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/related inner mime type: %s", contentType)
156 }
157 }
158 }
159
160 return textBody, htmlBody, embeddedFiles, err
161}
162
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200163func parseMultipartAlternative(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
164 pmr := multipart.NewReader(msg, boundary)
165 for {
166 part, err := pmr.NextPart()
167
168 if err == io.EOF {
169 break
170 } else if err != nil {
171 return textBody, htmlBody, embeddedFiles, err
172 }
173
174 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
Dusan Kasanc661cc02017-04-18 10:51:51 +0200175 if err != nil {
176 return textBody, htmlBody, embeddedFiles, err
177 }
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200178
179 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200180 case contentTypeTextPlain:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200181 ppContent, err := ioutil.ReadAll(part)
182 if err != nil {
183 return textBody, htmlBody, embeddedFiles, err
184 }
185
186 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200187 case contentTypeTextHtml:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200188 ppContent, err := ioutil.ReadAll(part)
189 if err != nil {
190 return textBody, htmlBody, embeddedFiles, err
191 }
192
193 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200194 case contentTypeMultipartRelated:
Kevin Chen9b9506a2018-05-03 22:17:38 -0400195 tb, hb, ef, err := parseMultipartRelated(part, params["boundary"])
Dusan Kasan1a966482017-04-18 10:45:25 +0200196 if err != nil {
197 return textBody, htmlBody, embeddedFiles, err
198 }
199
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200200 htmlBody += hb
201 textBody += tb
202 embeddedFiles = append(embeddedFiles, ef...)
203 default:
204 if isEmbeddedFile(part) {
205 ef, err := decodeEmbeddedFile(part)
206 if err != nil {
207 return textBody, htmlBody, embeddedFiles, err
208 }
209
210 embeddedFiles = append(embeddedFiles, ef)
211 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200212 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/alternative inner mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200213 }
214 }
215 }
216
217 return textBody, htmlBody, embeddedFiles, err
218}
219
220func parseMultipartMixed(msg io.Reader, boundary string) (textBody, htmlBody string, attachments []Attachment, embeddedFiles []EmbeddedFile, err error) {
221 mr := multipart.NewReader(msg, boundary)
222 for {
223 part, err := mr.NextPart()
224 if err == io.EOF {
225 break
226 } else if err != nil {
227 return textBody, htmlBody, attachments, embeddedFiles, err
228 }
229
230 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
231 if err != nil {
232 return textBody, htmlBody, attachments, embeddedFiles, err
233 }
234
Dusan Kasan45ca2642017-04-18 10:39:35 +0200235 if contentType == contentTypeMultipartAlternative {
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200236 textBody, htmlBody, embeddedFiles, err = parseMultipartAlternative(part, params["boundary"])
237 if err != nil {
238 return textBody, htmlBody, attachments, embeddedFiles, err
239 }
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400240 } else if contentType == contentTypeMultipartRelated {
241 textBody, htmlBody, embeddedFiles, err = parseMultipartRelated(part, params["boundary"])
242 if err != nil {
243 return textBody, htmlBody, attachments, embeddedFiles, err
244 }
Maya Rashisha3803bd2019-06-08 17:53:21 +0300245 } else if contentType == contentTypeTextPlain {
246 ppContent, err := ioutil.ReadAll(part)
247 if err != nil {
248 return textBody, htmlBody, attachments, embeddedFiles, err
249 }
250
251 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
k-yomo2e670d92020-04-30 22:42:48 +0900252 } else if contentType == contentTypeTextHtml {
253 ppContent, err := ioutil.ReadAll(part)
254 if err != nil {
255 return textBody, htmlBody, attachments, embeddedFiles, err
256 }
257
258 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200259 } else if isAttachment(part) {
260 at, err := decodeAttachment(part)
261 if err != nil {
262 return textBody, htmlBody, attachments, embeddedFiles, err
263 }
264
265 attachments = append(attachments, at)
266 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200267 return textBody, htmlBody, attachments, embeddedFiles, fmt.Errorf("Unknown multipart/mixed nested mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200268 }
269 }
270
271 return textBody, htmlBody, attachments, embeddedFiles, err
Dusan Kasan17e497e2017-04-10 22:44:22 +0200272}
273
// decodeMimeSentence decodes every RFC 2047 encoded-word in s and returns
// the resulting text.
//
// s is split on single spaces. Words that mime.WordDecoder cannot decode
// (plain text, malformed encoded-words, unsupported charsets) are kept
// verbatim. Spaces separating two adjacent encoded-words are dropped; all
// other spaces, including leading, trailing and repeated ones, are preserved.
func decodeMimeSentence(s string) string {
	dec := new(mime.WordDecoder)

	var out bytes.Buffer
	pendingSpaces := 0   // separators seen since the last word was written
	prevEncoded := false // whether the last written word was an encoded-word

	for i, word := range strings.Split(s, " ") {
		if i > 0 {
			pendingSpaces++
		}
		if word == "" {
			// Produced by consecutive, leading or trailing spaces.
			continue
		}

		text, err := dec.Decode(word)
		encoded := err == nil
		if !encoded {
			text = word
		}

		// Only whitespace between two encoded-words is insignificant; a
		// plain word next to an encoded-word keeps its separating space.
		if !(prevEncoded && encoded) {
			out.WriteString(strings.Repeat(" ", pendingSpaces))
		}
		pendingSpaces = 0

		out.WriteString(text)
		prevEncoded = encoded
	}

	// Trailing spaces, if any.
	out.WriteString(strings.Repeat(" ", pendingSpaces))

	return out.String()
}
294
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200295func decodeHeaderMime(header mail.Header) (mail.Header, error) {
Dusan Kasan17e497e2017-04-10 22:44:22 +0200296 parsedHeader := map[string][]string{}
297
298 for headerName, headerData := range header {
299
300 parsedHeaderData := []string{}
301 for _, headerValue := range headerData {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200302 parsedHeaderData = append(parsedHeaderData, decodeMimeSentence(headerValue))
Dusan Kasan17e497e2017-04-10 22:44:22 +0200303 }
304
305 parsedHeader[headerName] = parsedHeaderData
306 }
307
308 return mail.Header(parsedHeader), nil
309}
310
// isEmbeddedFile reports whether part declares a Content-Transfer-Encoding
// header; such parts are treated as embedded files by the multipart parsers.
func isEmbeddedFile(part *multipart.Part) bool {
	encoding := part.Header.Get("Content-Transfer-Encoding")

	return len(encoding) > 0
}
314
315func decodeEmbeddedFile(part *multipart.Part) (ef EmbeddedFile, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200316 cid := decodeMimeSentence(part.Header.Get("Content-Id"))
Dusan Kasan428369f2020-02-24 00:47:31 +0100317 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200318 if err != nil {
319 return
320 }
321
322 ef.CID = strings.Trim(cid, "<>")
323 ef.Data = decoded
324 ef.ContentType = part.Header.Get("Content-Type")
325
326 return
327}
328
// isAttachment reports whether part carries a non-empty filename, as
// returned by multipart.Part.FileName.
func isAttachment(part *multipart.Part) bool {
	name := part.FileName()

	return len(name) > 0
}
332
333func decodeAttachment(part *multipart.Part) (at Attachment, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200334 filename := decodeMimeSentence(part.FileName())
Dusan Kasan428369f2020-02-24 00:47:31 +0100335 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200336 if err != nil {
337 return
338 }
339
340 at.Filename = filename
341 at.Data = decoded
342 at.ContentType = strings.Split(part.Header.Get("Content-Type"), ";")[0]
343
344 return
345}
346
// decodeContent returns a reader over content with the given
// Content-Transfer-Encoding removed.
//
// The encoding name is matched case-insensitively and ignoring surrounding
// whitespace:
//
//   - "base64": content is read fully and base64-decoded;
//   - "7bit", "8bit", "binary": identity encodings, content is read fully
//     and returned unchanged;
//   - "" (no encoding declared): content itself is returned without being
//     read.
//
// Any other encoding yields an "unknown encoding" error. Read or decode
// failures are returned as-is.
func decodeContent(content io.Reader, encoding string) (io.Reader, error) {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "base64":
		b, err := ioutil.ReadAll(base64.NewDecoder(base64.StdEncoding, content))
		if err != nil {
			return nil, err
		}

		return bytes.NewReader(b), nil
	case "7bit", "8bit", "binary":
		b, err := ioutil.ReadAll(content)
		if err != nil {
			return nil, err
		}

		return bytes.NewReader(b), nil
	case "":
		return content, nil
	default:
		return nil, fmt.Errorf("unknown encoding: %s", encoding)
	}
}
370
// headerParser parses individual header values while remembering the first
// error it encounters. Once err is set, every parse method returns its zero
// value without doing any work, so a caller can run a series of parses and
// check err once at the end.
//
// All methods use pointer receivers: they record failures in err, and with a
// value receiver that assignment would only reach a copy and be lost.
type headerParser struct {
	header *mail.Header
	err    error
}

// parseAddress parses s as a single RFC 5322 address. It returns nil when s
// contains only spaces/newlines, when an earlier error is pending, or when
// parsing fails (in which case the error is recorded in hp.err).
func (hp *headerParser) parseAddress(s string) (ma *mail.Address) {
	if hp.err != nil {
		return nil
	}

	if strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, hp.err = mail.ParseAddress(s)

	return ma
}

// parseAddressList parses s as a comma-separated RFC 5322 address list. It
// returns nil when s contains only spaces/newlines, when an earlier error is
// pending, or when parsing fails (in which case the error is recorded in
// hp.err).
func (hp *headerParser) parseAddressList(s string) (ma []*mail.Address) {
	if hp.err != nil {
		return nil
	}

	if strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, hp.err = mail.ParseAddressList(s)

	return ma
}

// parseTime parses s as a message date. It returns the zero time when s is
// empty or an earlier error is pending.
//
// The explicit layouts are tried first; if none matches, mail.ParseDate is
// used as a fallback so that other valid RFC 5322 forms (for example a date
// without the day of week) are accepted. Only if that also fails is the
// error recorded in hp.err.
func (hp *headerParser) parseTime(s string) (t time.Time) {
	if hp.err != nil || s == "" {
		return t
	}

	formats := []string{
		time.RFC1123Z,
		"Mon, 2 Jan 2006 15:04:05 -0700",
		time.RFC1123Z + " (MST)",
		"Mon, 2 Jan 2006 15:04:05 -0700 (MST)",
	}

	for _, format := range formats {
		if parsed, err := time.Parse(format, s); err == nil {
			return parsed
		}
	}

	t, hp.err = mail.ParseDate(s)

	return t
}

// parseMessageId strips surrounding '<', '>' and space characters from s.
// It returns "" when an earlier error is pending.
func (hp *headerParser) parseMessageId(s string) string {
	if hp.err != nil {
		return ""
	}

	return strings.Trim(s, "<> ")
}

// parseMessageIdList splits s on spaces and returns the message id of every
// non-blank element, cleaned with parseMessageId. It returns nil when s has
// no ids or an earlier error is pending.
func (hp *headerParser) parseMessageIdList(s string) (result []string) {
	if hp.err != nil {
		return nil
	}

	for _, p := range strings.Split(s, " ") {
		if strings.Trim(p, " \n") != "" {
			result = append(result, hp.parseMessageId(p))
		}
	}

	return result
}
446
// Attachment is a file attached to a message: its filename, its content
// type and its data, exposed as an io.Reader.
type Attachment struct {
	// Filename is the attachment's file name.
	Filename string
	// ContentType is the attachment's content type.
	ContentType string
	// Data gives access to the attachment's content.
	Data io.Reader
}
453
// EmbeddedFile is a file embedded in a message body: its content id, its
// content type and its data, exposed as an io.Reader.
type EmbeddedFile struct {
	// CID is the content id of the embedded file.
	CID string
	// ContentType is the embedded file's content type.
	ContentType string
	// Data gives access to the embedded file's content.
	Data io.Reader
}
460
Dusan Kasan1a966482017-04-18 10:45:25 +0200461// Email with fields for all the headers defined in RFC5322 with it's attachments and
Dusan Kasan17e497e2017-04-10 22:44:22 +0200462type Email struct {
463 Header mail.Header
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200464
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200465 Subject string
466 Sender *mail.Address
467 From []*mail.Address
468 ReplyTo []*mail.Address
469 To []*mail.Address
470 Cc []*mail.Address
471 Bcc []*mail.Address
472 Date time.Time
473 MessageID string
474 InReplyTo []string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200475 References []string
476
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200477 ResentFrom []*mail.Address
478 ResentSender *mail.Address
479 ResentTo []*mail.Address
480 ResentDate time.Time
481 ResentCc []*mail.Address
482 ResentBcc []*mail.Address
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200483 ResentMessageID string
484
Dusan Kasan428369f2020-02-24 00:47:31 +0100485 ContentType string
486 Content io.Reader
487
Dusan Kasan17e497e2017-04-10 22:44:22 +0200488 HTMLBody string
489 TextBody string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200490
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200491 Attachments []Attachment
Dusan Kasan17e497e2017-04-10 22:44:22 +0200492 EmbeddedFiles []EmbeddedFile
Dusan Kasan428369f2020-02-24 00:47:31 +0100493}