blob: 3011f9657c283511765732fc0ba8e4e3a332c251 [file] [log] [blame]
Dusan Kasan17e497e2017-04-10 22:44:22 +02001package parsemail
2
3import (
Dusan Kasan17e497e2017-04-10 22:44:22 +02004 "bytes"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02005 "encoding/base64"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02006 "fmt"
7 "io"
8 "io/ioutil"
9 "mime"
10 "mime/multipart"
11 "net/mail"
12 "strings"
13 "time"
Dusan Kasan17e497e2017-04-10 22:44:22 +020014)
15
// Media types the parser dispatches on, spelled the way mime.ParseMediaType
// reports them (lower case, without parameters).
const (
	contentTypeMultipartMixed       = "multipart/mixed"
	contentTypeMultipartAlternative = "multipart/alternative"
	contentTypeMultipartRelated     = "multipart/related"
	contentTypeTextHtml             = "text/html"
	contentTypeTextPlain            = "text/plain"
)
Dusan Kasan17e497e2017-04-10 22:44:22 +020021
Dusan Kasan45ca2642017-04-18 10:39:35 +020022// Parse an email message read from io.Reader into parsemail.Email struct
Dusan Kasanb49ceb62017-04-13 00:00:36 +020023func Parse(r io.Reader) (email Email, err error) {
Dusan Kasan4595dfe2017-04-13 00:38:24 +020024 msg, err := mail.ReadMessage(r)
Dusan Kasan17e497e2017-04-10 22:44:22 +020025 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020026 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020027 }
28
Dusan Kasanb49ceb62017-04-13 00:00:36 +020029 email, err = createEmailFromHeader(msg.Header)
Dusan Kasan17e497e2017-04-10 22:44:22 +020030 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020031 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020032 }
33
Dusan Kasan428369f2020-02-24 00:47:31 +010034 email.ContentType = msg.Header.Get("Content-Type")
35 contentType, params, err := parseContentType(email.ContentType)
Dusan Kasan17e497e2017-04-10 22:44:22 +020036 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020037 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020038 }
39
Dusan Kasan428369f2020-02-24 00:47:31 +010040
Dusan Kasanb49ceb62017-04-13 00:00:36 +020041 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +020042 case contentTypeMultipartMixed:
Dusan Kasan17e497e2017-04-10 22:44:22 +020043 email.TextBody, email.HTMLBody, email.Attachments, email.EmbeddedFiles, err = parseMultipartMixed(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020044 case contentTypeMultipartAlternative:
Dusan Kasan17e497e2017-04-10 22:44:22 +020045 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartAlternative(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020046 case contentTypeTextPlain:
Dusan Kasan17e497e2017-04-10 22:44:22 +020047 message, _ := ioutil.ReadAll(msg.Body)
48 email.TextBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +020049 case contentTypeTextHtml:
Dusan Kasan17e497e2017-04-10 22:44:22 +020050 message, _ := ioutil.ReadAll(msg.Body)
51 email.HTMLBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +020052 default:
Dusan Kasan428369f2020-02-24 00:47:31 +010053 email.Content, err = decodeContent(msg.Body, msg.Header.Get("Content-Transfer-Encoding"))
Dusan Kasan17e497e2017-04-10 22:44:22 +020054 }
55
Dusan Kasanb49ceb62017-04-13 00:00:36 +020056 return
57}
58
59func createEmailFromHeader(header mail.Header) (email Email, err error) {
Dusan Kasane668cf22017-04-18 12:56:51 +020060 hp := headerParser{header: &header}
61
Dusan Kasanf4376a62017-05-23 21:03:55 +020062 email.Subject = decodeMimeSentence(header.Get("Subject"))
Dusan Kasane668cf22017-04-18 12:56:51 +020063 email.From = hp.parseAddressList(header.Get("From"))
64 email.Sender = hp.parseAddress(header.Get("Sender"))
65 email.ReplyTo = hp.parseAddressList(header.Get("Reply-To"))
66 email.To = hp.parseAddressList(header.Get("To"))
67 email.Cc = hp.parseAddressList(header.Get("Cc"))
68 email.Bcc = hp.parseAddressList(header.Get("Bcc"))
69 email.Date = hp.parseTime(header.Get("Date"))
70 email.ResentFrom = hp.parseAddressList(header.Get("Resent-From"))
71 email.ResentSender = hp.parseAddress(header.Get("Resent-Sender"))
72 email.ResentTo = hp.parseAddressList(header.Get("Resent-To"))
73 email.ResentCc = hp.parseAddressList(header.Get("Resent-Cc"))
74 email.ResentBcc = hp.parseAddressList(header.Get("Resent-Bcc"))
75 email.ResentMessageID = hp.parseMessageId(header.Get("Resent-Message-ID"))
76 email.MessageID = hp.parseMessageId(header.Get("Message-ID"))
77 email.InReplyTo = hp.parseMessageIdList(header.Get("In-Reply-To"))
78 email.References = hp.parseMessageIdList(header.Get("References"))
79 email.ResentDate = hp.parseTime(header.Get("Resent-Date"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +020080
Dusan Kasane668cf22017-04-18 12:56:51 +020081 if hp.err != nil {
82 err = hp.err
Dusan Kasanb49ceb62017-04-13 00:00:36 +020083 return
84 }
85
Dusan Kasanb49ceb62017-04-13 00:00:36 +020086 //decode whole header for easier access to extra fields
87 //todo: should we decode? aren't only standard fields mime encoded?
88 email.Header, err = decodeHeaderMime(header)
89 if err != nil {
90 return
91 }
92
93 return
94}
95
96func parseContentType(contentTypeHeader string) (contentType string, params map[string]string, err error) {
97 if contentTypeHeader == "" {
Dusan Kasan45ca2642017-04-18 10:39:35 +020098 contentType = contentTypeTextPlain
Dusan Kasanb49ceb62017-04-13 00:00:36 +020099 return
100 }
101
102 return mime.ParseMediaType(contentTypeHeader)
103}
104
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400105func parseMultipartRelated(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
106 pmr := multipart.NewReader(msg, boundary)
107 for {
108 part, err := pmr.NextPart()
109
110 if err == io.EOF {
111 break
112 } else if err != nil {
113 return textBody, htmlBody, embeddedFiles, err
114 }
115
116 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
117 if err != nil {
118 return textBody, htmlBody, embeddedFiles, err
119 }
120
121 switch contentType {
122 case contentTypeTextPlain:
123 ppContent, err := ioutil.ReadAll(part)
124 if err != nil {
125 return textBody, htmlBody, embeddedFiles, err
126 }
127
128 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
129 case contentTypeTextHtml:
130 ppContent, err := ioutil.ReadAll(part)
131 if err != nil {
132 return textBody, htmlBody, embeddedFiles, err
133 }
134
135 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
136 case contentTypeMultipartAlternative:
137 tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
138 if err != nil {
139 return textBody, htmlBody, embeddedFiles, err
140 }
141
142 htmlBody += hb
143 textBody += tb
144 embeddedFiles = append(embeddedFiles, ef...)
145 default:
146 if isEmbeddedFile(part) {
147 ef, err := decodeEmbeddedFile(part)
148 if err != nil {
149 return textBody, htmlBody, embeddedFiles, err
150 }
151
152 embeddedFiles = append(embeddedFiles, ef)
153 } else {
154 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/related inner mime type: %s", contentType)
155 }
156 }
157 }
158
159 return textBody, htmlBody, embeddedFiles, err
160}
161
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200162func parseMultipartAlternative(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
163 pmr := multipart.NewReader(msg, boundary)
164 for {
165 part, err := pmr.NextPart()
166
167 if err == io.EOF {
168 break
169 } else if err != nil {
170 return textBody, htmlBody, embeddedFiles, err
171 }
172
173 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
Dusan Kasanc661cc02017-04-18 10:51:51 +0200174 if err != nil {
175 return textBody, htmlBody, embeddedFiles, err
176 }
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200177
178 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200179 case contentTypeTextPlain:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200180 ppContent, err := ioutil.ReadAll(part)
181 if err != nil {
182 return textBody, htmlBody, embeddedFiles, err
183 }
184
185 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200186 case contentTypeTextHtml:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200187 ppContent, err := ioutil.ReadAll(part)
188 if err != nil {
189 return textBody, htmlBody, embeddedFiles, err
190 }
191
192 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200193 case contentTypeMultipartRelated:
Kevin Chen9b9506a2018-05-03 22:17:38 -0400194 tb, hb, ef, err := parseMultipartRelated(part, params["boundary"])
Dusan Kasan1a966482017-04-18 10:45:25 +0200195 if err != nil {
196 return textBody, htmlBody, embeddedFiles, err
197 }
198
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200199 htmlBody += hb
200 textBody += tb
201 embeddedFiles = append(embeddedFiles, ef...)
202 default:
203 if isEmbeddedFile(part) {
204 ef, err := decodeEmbeddedFile(part)
205 if err != nil {
206 return textBody, htmlBody, embeddedFiles, err
207 }
208
209 embeddedFiles = append(embeddedFiles, ef)
210 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200211 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/alternative inner mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200212 }
213 }
214 }
215
216 return textBody, htmlBody, embeddedFiles, err
217}
218
219func parseMultipartMixed(msg io.Reader, boundary string) (textBody, htmlBody string, attachments []Attachment, embeddedFiles []EmbeddedFile, err error) {
220 mr := multipart.NewReader(msg, boundary)
221 for {
222 part, err := mr.NextPart()
223 if err == io.EOF {
224 break
225 } else if err != nil {
226 return textBody, htmlBody, attachments, embeddedFiles, err
227 }
228
229 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
230 if err != nil {
231 return textBody, htmlBody, attachments, embeddedFiles, err
232 }
233
Dusan Kasan45ca2642017-04-18 10:39:35 +0200234 if contentType == contentTypeMultipartAlternative {
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200235 textBody, htmlBody, embeddedFiles, err = parseMultipartAlternative(part, params["boundary"])
236 if err != nil {
237 return textBody, htmlBody, attachments, embeddedFiles, err
238 }
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400239 } else if contentType == contentTypeMultipartRelated {
240 textBody, htmlBody, embeddedFiles, err = parseMultipartRelated(part, params["boundary"])
241 if err != nil {
242 return textBody, htmlBody, attachments, embeddedFiles, err
243 }
Maya Rashisha3803bd2019-06-08 17:53:21 +0300244 } else if contentType == contentTypeTextPlain {
245 ppContent, err := ioutil.ReadAll(part)
246 if err != nil {
247 return textBody, htmlBody, attachments, embeddedFiles, err
248 }
249
250 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200251 } else if isAttachment(part) {
252 at, err := decodeAttachment(part)
253 if err != nil {
254 return textBody, htmlBody, attachments, embeddedFiles, err
255 }
256
257 attachments = append(attachments, at)
258 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200259 return textBody, htmlBody, attachments, embeddedFiles, fmt.Errorf("Unknown multipart/mixed nested mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200260 }
261 }
262
263 return textBody, htmlBody, attachments, embeddedFiles, err
Dusan Kasan17e497e2017-04-10 22:44:22 +0200264}
265
// decodeMimeSentence decodes the RFC 2047 encoded-words contained in s and
// returns the resulting text.
//
// s is split on single spaces and every token is decoded on its own. Tokens
// that are not valid encoded-words (plain text, or words the decoder cannot
// handle) are kept verbatim. The separating space is dropped only between two
// adjacent encoded-words, as RFC 2047 requires; it is preserved everywhere
// else, so plain text next to an encoded-word keeps its spacing.
func decodeMimeSentence(s string) string {
	dec := new(mime.WordDecoder)

	var out strings.Builder
	prevEncoded := false

	for i, word := range strings.Split(s, " ") {
		text, err := dec.Decode(word)
		encoded := err == nil
		if !encoded {
			text = word
		}

		if i > 0 && !(encoded && prevEncoded) {
			out.WriteByte(' ')
		}
		out.WriteString(text)

		prevEncoded = encoded
	}

	return out.String()
}
286
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200287func decodeHeaderMime(header mail.Header) (mail.Header, error) {
Dusan Kasan17e497e2017-04-10 22:44:22 +0200288 parsedHeader := map[string][]string{}
289
290 for headerName, headerData := range header {
291
292 parsedHeaderData := []string{}
293 for _, headerValue := range headerData {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200294 parsedHeaderData = append(parsedHeaderData, decodeMimeSentence(headerValue))
Dusan Kasan17e497e2017-04-10 22:44:22 +0200295 }
296
297 parsedHeader[headerName] = parsedHeaderData
298 }
299
300 return mail.Header(parsedHeader), nil
301}
302
// isEmbeddedFile reports whether part declares a Content-Transfer-Encoding
// header; that is the only criterion used to treat a part as an embedded file.
func isEmbeddedFile(part *multipart.Part) bool {
	encoding := part.Header.Get("Content-Transfer-Encoding")

	return len(encoding) > 0
}
306
307func decodeEmbeddedFile(part *multipart.Part) (ef EmbeddedFile, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200308 cid := decodeMimeSentence(part.Header.Get("Content-Id"))
Dusan Kasan428369f2020-02-24 00:47:31 +0100309 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200310 if err != nil {
311 return
312 }
313
314 ef.CID = strings.Trim(cid, "<>")
315 ef.Data = decoded
316 ef.ContentType = part.Header.Get("Content-Type")
317
318 return
319}
320
// isAttachment reports whether part carries a file name, as returned by
// multipart.Part.FileName.
func isAttachment(part *multipart.Part) bool {
	name := part.FileName()

	return len(name) > 0
}
324
325func decodeAttachment(part *multipart.Part) (at Attachment, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200326 filename := decodeMimeSentence(part.FileName())
Dusan Kasan428369f2020-02-24 00:47:31 +0100327 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200328 if err != nil {
329 return
330 }
331
332 at.Filename = filename
333 at.Data = decoded
334 at.ContentType = strings.Split(part.Header.Get("Content-Type"), ";")[0]
335
336 return
337}
338
// decodeContent reads content completely, undoes the given
// Content-Transfer-Encoding and returns the decoded bytes as an in-memory
// reader.
//
// The encoding name is matched case-insensitively and surrounding white space
// is ignored. Supported values are "base64" and the identity encodings "",
// "7bit", "8bit" and "binary". Any other value yields an error.
//
// The content is always buffered, also when no decoding is needed: callers
// pass multipart parts here, and a part can no longer be read once the
// multipart reader has advanced to the next part.
func decodeContent(content io.Reader, encoding string) (io.Reader, error) {
	var src io.Reader

	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "base64":
		src = base64.NewDecoder(base64.StdEncoding, content)
	case "", "7bit", "8bit", "binary":
		src = content
	default:
		return nil, fmt.Errorf("unknown encoding: %s", encoding)
	}

	b, err := ioutil.ReadAll(src)
	if err != nil {
		return nil, err
	}

	return bytes.NewReader(b), nil
}
355
// headerParser parses individual header values and remembers the first error
// it encounters in err. Once err is set, every further parse call returns a
// zero value without doing any work, so a caller can run a series of parse
// calls and check err once at the end.
//
// All methods use pointer receivers: they record errors in the parser itself,
// which would be lost if the methods operated on a copy.
type headerParser struct {
	header *mail.Header
	err    error
}

// parseAddress parses s as a single RFC 5322 address. It returns nil when s
// is blank, when the parser is already in an error state, or when s cannot be
// parsed (in which case the error is recorded in hp.err).
func (hp *headerParser) parseAddress(s string) (ma *mail.Address) {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, hp.err = mail.ParseAddress(s)

	return ma
}

// parseAddressList parses s as a list of RFC 5322 addresses. It returns nil
// when s is blank or when the parser is already in an error state; a parse
// failure is recorded in hp.err.
func (hp *headerParser) parseAddressList(s string) (ma []*mail.Address) {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, hp.err = mail.ParseAddressList(s)

	return ma
}

// parseTime parses s using the date layouts accepted by this package:
// RFC 1123 with numeric zone, with a one- or two-digit day, each optionally
// followed by a zone abbreviation in parentheses. It returns the zero time
// when s is empty or the parser is already in an error state. If no layout
// matches, the zero time is returned and the error of the last attempt is
// recorded in hp.err.
func (hp *headerParser) parseTime(s string) (t time.Time) {
	if hp.err != nil || s == "" {
		return t
	}

	formats := []string{
		time.RFC1123Z,
		"Mon, 2 Jan 2006 15:04:05 -0700",
		time.RFC1123Z + " (MST)",
		"Mon, 2 Jan 2006 15:04:05 -0700 (MST)",
	}

	// Track the error locally so that hp.err is only touched once all layouts
	// have failed.
	var err error
	for _, format := range formats {
		if t, err = time.Parse(format, s); err == nil {
			return t
		}
	}

	hp.err = err

	return t
}

// parseMessageId strips surrounding angle brackets and spaces from s. It
// returns "" when the parser is already in an error state.
func (hp *headerParser) parseMessageId(s string) string {
	if hp.err != nil {
		return ""
	}

	return strings.Trim(s, "<> ")
}

// parseMessageIdList splits s on white space and returns the message id
// contained in every non-empty token. It returns nil when s holds no ids or
// when the parser is already in an error state.
func (hp *headerParser) parseMessageIdList(s string) (result []string) {
	if hp.err != nil {
		return nil
	}

	for _, p := range strings.Fields(s) {
		result = append(result, hp.parseMessageId(p))
	}

	return result
}
431
// Attachment is a file attached to an email.
type Attachment struct {
	// Filename is the MIME-decoded file name of the attachment.
	Filename string
	// ContentType is the media type of the attachment, without parameters.
	ContentType string
	// Data provides the decoded content of the attachment.
	Data io.Reader
}
438
// EmbeddedFile is a file embedded in an email body and addressed by its
// content id.
type EmbeddedFile struct {
	// CID is the content id of the file, without surrounding angle brackets.
	CID string
	// ContentType is the Content-Type header value of the embedded part.
	ContentType string
	// Data provides the decoded content of the file.
	Data io.Reader
}
445
Dusan Kasan1a966482017-04-18 10:45:25 +0200446// Email with fields for all the headers defined in RFC5322 with it's attachments and
Dusan Kasan17e497e2017-04-10 22:44:22 +0200447type Email struct {
448 Header mail.Header
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200449
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200450 Subject string
451 Sender *mail.Address
452 From []*mail.Address
453 ReplyTo []*mail.Address
454 To []*mail.Address
455 Cc []*mail.Address
456 Bcc []*mail.Address
457 Date time.Time
458 MessageID string
459 InReplyTo []string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200460 References []string
461
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200462 ResentFrom []*mail.Address
463 ResentSender *mail.Address
464 ResentTo []*mail.Address
465 ResentDate time.Time
466 ResentCc []*mail.Address
467 ResentBcc []*mail.Address
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200468 ResentMessageID string
469
Dusan Kasan428369f2020-02-24 00:47:31 +0100470 ContentType string
471 Content io.Reader
472
Dusan Kasan17e497e2017-04-10 22:44:22 +0200473 HTMLBody string
474 TextBody string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200475
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200476 Attachments []Attachment
Dusan Kasan17e497e2017-04-10 22:44:22 +0200477 EmbeddedFiles []EmbeddedFile
Dusan Kasan428369f2020-02-24 00:47:31 +0100478}