blob: a8f84e8a8e033187a849f4ff4d05e8ac9f3ee195 [file] [log] [blame]
Dusan Kasan17e497e2017-04-10 22:44:22 +02001package parsemail
2
3import (
Dusan Kasan17e497e2017-04-10 22:44:22 +02004 "bytes"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02005 "encoding/base64"
Dusan Kasan4595dfe2017-04-13 00:38:24 +02006 "fmt"
7 "io"
8 "io/ioutil"
9 "mime"
10 "mime/multipart"
11 "net/mail"
12 "strings"
13 "time"
Dusan Kasan17e497e2017-04-10 22:44:22 +020014)
15
// MIME media types the parser dispatches on.
const (
	contentTypeMultipartMixed       = "multipart/mixed"
	contentTypeMultipartAlternative = "multipart/alternative"
	contentTypeMultipartRelated     = "multipart/related"
	contentTypeTextHtml             = "text/html"
	contentTypeTextPlain            = "text/plain"
)
Dusan Kasan17e497e2017-04-10 22:44:22 +020021
Dusan Kasan45ca2642017-04-18 10:39:35 +020022// Parse an email message read from io.Reader into parsemail.Email struct
Dusan Kasanb49ceb62017-04-13 00:00:36 +020023func Parse(r io.Reader) (email Email, err error) {
Dusan Kasan4595dfe2017-04-13 00:38:24 +020024 msg, err := mail.ReadMessage(r)
Dusan Kasan17e497e2017-04-10 22:44:22 +020025 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020026 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020027 }
28
Dusan Kasanb49ceb62017-04-13 00:00:36 +020029 email, err = createEmailFromHeader(msg.Header)
Dusan Kasan17e497e2017-04-10 22:44:22 +020030 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020031 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020032 }
33
Dusan Kasan428369f2020-02-24 00:47:31 +010034 email.ContentType = msg.Header.Get("Content-Type")
35 contentType, params, err := parseContentType(email.ContentType)
Dusan Kasan17e497e2017-04-10 22:44:22 +020036 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020037 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020038 }
39
Dusan Kasanb49ceb62017-04-13 00:00:36 +020040 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +020041 case contentTypeMultipartMixed:
Dusan Kasan17e497e2017-04-10 22:44:22 +020042 email.TextBody, email.HTMLBody, email.Attachments, email.EmbeddedFiles, err = parseMultipartMixed(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020043 case contentTypeMultipartAlternative:
Dusan Kasan17e497e2017-04-10 22:44:22 +020044 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartAlternative(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020045 case contentTypeTextPlain:
Dusan Kasan17e497e2017-04-10 22:44:22 +020046 message, _ := ioutil.ReadAll(msg.Body)
47 email.TextBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +020048 case contentTypeTextHtml:
Dusan Kasan17e497e2017-04-10 22:44:22 +020049 message, _ := ioutil.ReadAll(msg.Body)
50 email.HTMLBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +020051 default:
Dusan Kasan428369f2020-02-24 00:47:31 +010052 email.Content, err = decodeContent(msg.Body, msg.Header.Get("Content-Transfer-Encoding"))
Dusan Kasan17e497e2017-04-10 22:44:22 +020053 }
54
Dusan Kasanb49ceb62017-04-13 00:00:36 +020055 return
56}
57
58func createEmailFromHeader(header mail.Header) (email Email, err error) {
Dusan Kasane668cf22017-04-18 12:56:51 +020059 hp := headerParser{header: &header}
60
Dusan Kasanf4376a62017-05-23 21:03:55 +020061 email.Subject = decodeMimeSentence(header.Get("Subject"))
Dusan Kasane668cf22017-04-18 12:56:51 +020062 email.From = hp.parseAddressList(header.Get("From"))
63 email.Sender = hp.parseAddress(header.Get("Sender"))
64 email.ReplyTo = hp.parseAddressList(header.Get("Reply-To"))
65 email.To = hp.parseAddressList(header.Get("To"))
66 email.Cc = hp.parseAddressList(header.Get("Cc"))
67 email.Bcc = hp.parseAddressList(header.Get("Bcc"))
68 email.Date = hp.parseTime(header.Get("Date"))
69 email.ResentFrom = hp.parseAddressList(header.Get("Resent-From"))
70 email.ResentSender = hp.parseAddress(header.Get("Resent-Sender"))
71 email.ResentTo = hp.parseAddressList(header.Get("Resent-To"))
72 email.ResentCc = hp.parseAddressList(header.Get("Resent-Cc"))
73 email.ResentBcc = hp.parseAddressList(header.Get("Resent-Bcc"))
74 email.ResentMessageID = hp.parseMessageId(header.Get("Resent-Message-ID"))
75 email.MessageID = hp.parseMessageId(header.Get("Message-ID"))
76 email.InReplyTo = hp.parseMessageIdList(header.Get("In-Reply-To"))
77 email.References = hp.parseMessageIdList(header.Get("References"))
78 email.ResentDate = hp.parseTime(header.Get("Resent-Date"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +020079
Dusan Kasane668cf22017-04-18 12:56:51 +020080 if hp.err != nil {
81 err = hp.err
Dusan Kasanb49ceb62017-04-13 00:00:36 +020082 return
83 }
84
Dusan Kasanb49ceb62017-04-13 00:00:36 +020085 //decode whole header for easier access to extra fields
86 //todo: should we decode? aren't only standard fields mime encoded?
87 email.Header, err = decodeHeaderMime(header)
88 if err != nil {
89 return
90 }
91
92 return
93}
94
95func parseContentType(contentTypeHeader string) (contentType string, params map[string]string, err error) {
96 if contentTypeHeader == "" {
Dusan Kasan45ca2642017-04-18 10:39:35 +020097 contentType = contentTypeTextPlain
Dusan Kasanb49ceb62017-04-13 00:00:36 +020098 return
99 }
100
101 return mime.ParseMediaType(contentTypeHeader)
102}
103
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400104func parseMultipartRelated(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
105 pmr := multipart.NewReader(msg, boundary)
106 for {
107 part, err := pmr.NextPart()
108
109 if err == io.EOF {
110 break
111 } else if err != nil {
112 return textBody, htmlBody, embeddedFiles, err
113 }
114
115 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
116 if err != nil {
117 return textBody, htmlBody, embeddedFiles, err
118 }
119
120 switch contentType {
121 case contentTypeTextPlain:
122 ppContent, err := ioutil.ReadAll(part)
123 if err != nil {
124 return textBody, htmlBody, embeddedFiles, err
125 }
126
127 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
128 case contentTypeTextHtml:
129 ppContent, err := ioutil.ReadAll(part)
130 if err != nil {
131 return textBody, htmlBody, embeddedFiles, err
132 }
133
134 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
135 case contentTypeMultipartAlternative:
136 tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
137 if err != nil {
138 return textBody, htmlBody, embeddedFiles, err
139 }
140
141 htmlBody += hb
142 textBody += tb
143 embeddedFiles = append(embeddedFiles, ef...)
144 default:
145 if isEmbeddedFile(part) {
146 ef, err := decodeEmbeddedFile(part)
147 if err != nil {
148 return textBody, htmlBody, embeddedFiles, err
149 }
150
151 embeddedFiles = append(embeddedFiles, ef)
152 } else {
153 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/related inner mime type: %s", contentType)
154 }
155 }
156 }
157
158 return textBody, htmlBody, embeddedFiles, err
159}
160
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200161func parseMultipartAlternative(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
162 pmr := multipart.NewReader(msg, boundary)
163 for {
164 part, err := pmr.NextPart()
165
166 if err == io.EOF {
167 break
168 } else if err != nil {
169 return textBody, htmlBody, embeddedFiles, err
170 }
171
172 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
Dusan Kasanc661cc02017-04-18 10:51:51 +0200173 if err != nil {
174 return textBody, htmlBody, embeddedFiles, err
175 }
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200176
177 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200178 case contentTypeTextPlain:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200179 ppContent, err := ioutil.ReadAll(part)
180 if err != nil {
181 return textBody, htmlBody, embeddedFiles, err
182 }
183
184 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200185 case contentTypeTextHtml:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200186 ppContent, err := ioutil.ReadAll(part)
187 if err != nil {
188 return textBody, htmlBody, embeddedFiles, err
189 }
190
191 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200192 case contentTypeMultipartRelated:
Kevin Chen9b9506a2018-05-03 22:17:38 -0400193 tb, hb, ef, err := parseMultipartRelated(part, params["boundary"])
Dusan Kasan1a966482017-04-18 10:45:25 +0200194 if err != nil {
195 return textBody, htmlBody, embeddedFiles, err
196 }
197
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200198 htmlBody += hb
199 textBody += tb
200 embeddedFiles = append(embeddedFiles, ef...)
201 default:
202 if isEmbeddedFile(part) {
203 ef, err := decodeEmbeddedFile(part)
204 if err != nil {
205 return textBody, htmlBody, embeddedFiles, err
206 }
207
208 embeddedFiles = append(embeddedFiles, ef)
209 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200210 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/alternative inner mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200211 }
212 }
213 }
214
215 return textBody, htmlBody, embeddedFiles, err
216}
217
218func parseMultipartMixed(msg io.Reader, boundary string) (textBody, htmlBody string, attachments []Attachment, embeddedFiles []EmbeddedFile, err error) {
219 mr := multipart.NewReader(msg, boundary)
220 for {
221 part, err := mr.NextPart()
222 if err == io.EOF {
223 break
224 } else if err != nil {
225 return textBody, htmlBody, attachments, embeddedFiles, err
226 }
227
228 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
229 if err != nil {
230 return textBody, htmlBody, attachments, embeddedFiles, err
231 }
232
Dusan Kasan45ca2642017-04-18 10:39:35 +0200233 if contentType == contentTypeMultipartAlternative {
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200234 textBody, htmlBody, embeddedFiles, err = parseMultipartAlternative(part, params["boundary"])
235 if err != nil {
236 return textBody, htmlBody, attachments, embeddedFiles, err
237 }
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400238 } else if contentType == contentTypeMultipartRelated {
239 textBody, htmlBody, embeddedFiles, err = parseMultipartRelated(part, params["boundary"])
240 if err != nil {
241 return textBody, htmlBody, attachments, embeddedFiles, err
242 }
Maya Rashisha3803bd2019-06-08 17:53:21 +0300243 } else if contentType == contentTypeTextPlain {
244 ppContent, err := ioutil.ReadAll(part)
245 if err != nil {
246 return textBody, htmlBody, attachments, embeddedFiles, err
247 }
248
249 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200250 } else if isAttachment(part) {
251 at, err := decodeAttachment(part)
252 if err != nil {
253 return textBody, htmlBody, attachments, embeddedFiles, err
254 }
255
256 attachments = append(attachments, at)
257 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200258 return textBody, htmlBody, attachments, embeddedFiles, fmt.Errorf("Unknown multipart/mixed nested mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200259 }
260 }
261
262 return textBody, htmlBody, attachments, embeddedFiles, err
Dusan Kasan17e497e2017-04-10 22:44:22 +0200263}
264
// decodeMimeSentence decodes the RFC 2047 encoded-words in s and returns the
// resulting text.
//
// s is split on single spaces and each token is decoded on its own; a token
// that is not a decodable encoded-word (plain text, unsupported charset, ...)
// is kept verbatim. Spaces between tokens are preserved, except that the
// whitespace separating two adjacent encoded-words is dropped. The previous
// implementation also dropped the space between a plain word and a following
// encoded-word, gluing them together.
func decodeMimeSentence(s string) string {
	var (
		dec         mime.WordDecoder
		out         strings.Builder
		pending     int  // spaces seen since the last non-empty token
		prevEncoded bool // whether the last non-empty token was an encoded-word
	)

	for i, word := range strings.Split(s, " ") {
		if i > 0 {
			pending++
		}
		if word == "" {
			// Consecutive, leading or trailing spaces: just remember them.
			continue
		}

		text, err := dec.Decode(word)
		encoded := err == nil
		if !encoded {
			text = word
		}

		// Only whitespace between two encoded-words is insignificant.
		if !(encoded && prevEncoded) {
			out.WriteString(strings.Repeat(" ", pending))
		}
		pending = 0

		out.WriteString(text)
		prevEncoded = encoded
	}

	// Keep trailing spaces of the input.
	out.WriteString(strings.Repeat(" ", pending))

	return out.String()
}
285
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200286func decodeHeaderMime(header mail.Header) (mail.Header, error) {
Dusan Kasan17e497e2017-04-10 22:44:22 +0200287 parsedHeader := map[string][]string{}
288
289 for headerName, headerData := range header {
290
291 parsedHeaderData := []string{}
292 for _, headerValue := range headerData {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200293 parsedHeaderData = append(parsedHeaderData, decodeMimeSentence(headerValue))
Dusan Kasan17e497e2017-04-10 22:44:22 +0200294 }
295
296 parsedHeader[headerName] = parsedHeaderData
297 }
298
299 return mail.Header(parsedHeader), nil
300}
301
// isEmbeddedFile reports whether part carries a non-empty
// Content-Transfer-Encoding header, which is what this package uses to
// recognise an embedded file.
func isEmbeddedFile(part *multipart.Part) bool {
	encoding := part.Header.Get("Content-Transfer-Encoding")
	return len(encoding) > 0
}
305
306func decodeEmbeddedFile(part *multipart.Part) (ef EmbeddedFile, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200307 cid := decodeMimeSentence(part.Header.Get("Content-Id"))
Dusan Kasan428369f2020-02-24 00:47:31 +0100308 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200309 if err != nil {
310 return
311 }
312
313 ef.CID = strings.Trim(cid, "<>")
314 ef.Data = decoded
315 ef.ContentType = part.Header.Get("Content-Type")
316
317 return
318}
319
// isAttachment reports whether part has a non-empty file name according to
// multipart.Part.FileName.
func isAttachment(part *multipart.Part) bool {
	name := part.FileName()
	return len(name) > 0
}
323
324func decodeAttachment(part *multipart.Part) (at Attachment, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200325 filename := decodeMimeSentence(part.FileName())
Dusan Kasan428369f2020-02-24 00:47:31 +0100326 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200327 if err != nil {
328 return
329 }
330
331 at.Filename = filename
332 at.Data = decoded
333 at.ContentType = strings.Split(part.Header.Get("Content-Type"), ";")[0]
334
335 return
336}
337
// decodeContent undoes the given Content-Transfer-Encoding on content and
// returns a reader over the decoded bytes.
//
// The encoding name is matched case-insensitively and with surrounding
// whitespace ignored, as header values such as "Base64" or "BASE64" are
// legal. Supported values:
//   - "base64": the content is fully read and base64-decoded,
//   - "7bit", "8bit", "binary": identity encodings; the content is fully
//     read and returned unchanged,
//   - "" (no header): content is returned as-is without being read.
//
// Any other encoding, or a read/decoding failure, yields a nil reader and an
// error.
func decodeContent(content io.Reader, encoding string) (io.Reader, error) {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "base64":
		b, err := ioutil.ReadAll(base64.NewDecoder(base64.StdEncoding, content))
		if err != nil {
			return nil, err
		}

		return bytes.NewReader(b), nil
	case "7bit", "8bit", "binary":
		b, err := ioutil.ReadAll(content)
		if err != nil {
			return nil, err
		}

		return bytes.NewReader(b), nil
	case "":
		return content, nil
	default:
		return nil, fmt.Errorf("unknown encoding: %s", encoding)
	}
}
361
// headerParser parses individual header values and remembers the first
// parsing error in err. Once err is set, every later parse call is a no-op
// that returns a zero value, so callers can run a sequence of parse calls and
// check err once at the end.
//
// All methods use a pointer receiver: with value receivers the error was
// stored on a copy and never became visible to the caller.
type headerParser struct {
	header *mail.Header
	err    error
}

// parseAddress parses s as a single RFC 5322 address. It returns nil for a
// blank value, after an earlier error, or when s is malformed (in which case
// the error is recorded in hp.err).
func (hp *headerParser) parseAddress(s string) *mail.Address {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, err := mail.ParseAddress(s)
	if err != nil {
		hp.err = err
		return nil
	}

	return ma
}

// parseAddressList parses s as a list of RFC 5322 addresses. It returns nil
// for a blank value, after an earlier error, or when s is malformed (in which
// case the error is recorded in hp.err).
func (hp *headerParser) parseAddressList(s string) []*mail.Address {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	ma, err := mail.ParseAddressList(s)
	if err != nil {
		hp.err = err
		return nil
	}

	return ma
}

// parseTime parses a date header value. The explicit layouts are tried in
// order; if none matches, mail.ParseDate is used as a last resort. The zero
// time is returned for an empty value or after an earlier error. hp.err is
// set only when every attempt fails, never by an intermediate mismatch.
func (hp *headerParser) parseTime(s string) time.Time {
	if hp.err != nil || s == "" {
		return time.Time{}
	}

	formats := []string{
		time.RFC1123Z,
		"Mon, 2 Jan 2006 15:04:05 -0700",
		time.RFC1123Z + " (MST)",
		"Mon, 2 Jan 2006 15:04:05 -0700 (MST)",
	}

	var lastErr error
	for _, format := range formats {
		t, err := time.Parse(format, s)
		if err == nil {
			return t
		}
		lastErr = err
	}

	if t, err := mail.ParseDate(s); err == nil {
		return t
	}

	hp.err = lastErr
	return time.Time{}
}

// parseMessageId strips surrounding "<", ">" and spaces from s. It returns ""
// after an earlier error.
func (hp *headerParser) parseMessageId(s string) string {
	if hp.err != nil {
		return ""
	}

	return strings.Trim(s, "<> ")
}

// parseMessageIdList splits s on spaces and returns the non-blank entries,
// each cleaned with parseMessageId. It returns nil after an earlier error.
func (hp *headerParser) parseMessageIdList(s string) (result []string) {
	if hp.err != nil {
		return nil
	}

	for _, p := range strings.Split(s, " ") {
		if strings.Trim(p, " \n") != "" {
			result = append(result, hp.parseMessageId(p))
		}
	}

	return result
}
437
// Attachment is a file attached to a message: its file name, its content
// type and its data (as an io.Reader).
type Attachment struct {
	// Filename is the name of the attached file.
	Filename string
	// ContentType is the media type of the attached file.
	ContentType string
	// Data gives access to the attachment's content.
	Data io.Reader
}
444
// EmbeddedFile is a file embedded in a message: its content id, its content
// type and its data (as an io.Reader).
type EmbeddedFile struct {
	// CID is the content id of the embedded file.
	CID string
	// ContentType is the content type of the embedded file.
	ContentType string
	// Data gives access to the embedded file's content.
	Data io.Reader
}
451
Dusan Kasan1a966482017-04-18 10:45:25 +0200452// Email with fields for all the headers defined in RFC5322 with it's attachments and
Dusan Kasan17e497e2017-04-10 22:44:22 +0200453type Email struct {
454 Header mail.Header
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200455
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200456 Subject string
457 Sender *mail.Address
458 From []*mail.Address
459 ReplyTo []*mail.Address
460 To []*mail.Address
461 Cc []*mail.Address
462 Bcc []*mail.Address
463 Date time.Time
464 MessageID string
465 InReplyTo []string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200466 References []string
467
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200468 ResentFrom []*mail.Address
469 ResentSender *mail.Address
470 ResentTo []*mail.Address
471 ResentDate time.Time
472 ResentCc []*mail.Address
473 ResentBcc []*mail.Address
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200474 ResentMessageID string
475
Dusan Kasan428369f2020-02-24 00:47:31 +0100476 ContentType string
477 Content io.Reader
478
Dusan Kasan17e497e2017-04-10 22:44:22 +0200479 HTMLBody string
480 TextBody string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200481
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200482 Attachments []Attachment
Dusan Kasan17e497e2017-04-10 22:44:22 +0200483 EmbeddedFiles []EmbeddedFile
Dusan Kasan428369f2020-02-24 00:47:31 +0100484}