blob: 86e360bb8b5639d0aaf8367b81276cc52d2b1c7e [file] [log] [blame]
Dusan Kasan17e497e2017-04-10 22:44:22 +02001package parsemail
2
import (
	"bytes"
	"encoding/base64"
	"fmt"
	"io"
	"io/ioutil"
	"mime"
	"mime/multipart"
	"mime/quotedprintable"
	"net/mail"
	"strings"
	"time"
)
15
// Media types that the parser dispatches on when walking a message body.
const (
	contentTypeMultipartMixed       = "multipart/mixed"
	contentTypeMultipartAlternative = "multipart/alternative"
	contentTypeMultipartRelated     = "multipart/related"
	contentTypeTextHtml             = "text/html"
	contentTypeTextPlain            = "text/plain"
)
Dusan Kasan17e497e2017-04-10 22:44:22 +020021
Dusan Kasan45ca2642017-04-18 10:39:35 +020022// Parse an email message read from io.Reader into parsemail.Email struct
Dusan Kasanb49ceb62017-04-13 00:00:36 +020023func Parse(r io.Reader) (email Email, err error) {
Dusan Kasan4595dfe2017-04-13 00:38:24 +020024 msg, err := mail.ReadMessage(r)
Dusan Kasan17e497e2017-04-10 22:44:22 +020025 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020026 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020027 }
28
Dusan Kasanb49ceb62017-04-13 00:00:36 +020029 email, err = createEmailFromHeader(msg.Header)
Dusan Kasan17e497e2017-04-10 22:44:22 +020030 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020031 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020032 }
33
Dusan Kasan428369f2020-02-24 00:47:31 +010034 email.ContentType = msg.Header.Get("Content-Type")
35 contentType, params, err := parseContentType(email.ContentType)
Dusan Kasan17e497e2017-04-10 22:44:22 +020036 if err != nil {
Dusan Kasanb49ceb62017-04-13 00:00:36 +020037 return
Dusan Kasan17e497e2017-04-10 22:44:22 +020038 }
39
Dusan Kasan428369f2020-02-24 00:47:31 +010040
Dusan Kasanb49ceb62017-04-13 00:00:36 +020041 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +020042 case contentTypeMultipartMixed:
Dusan Kasan17e497e2017-04-10 22:44:22 +020043 email.TextBody, email.HTMLBody, email.Attachments, email.EmbeddedFiles, err = parseMultipartMixed(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020044 case contentTypeMultipartAlternative:
Dusan Kasan17e497e2017-04-10 22:44:22 +020045 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartAlternative(msg.Body, params["boundary"])
Obi Symons89230f42020-04-04 14:32:11 +110046 case contentTypeMultipartRelated:
47 email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartRelated(msg.Body, params["boundary"])
Dusan Kasan45ca2642017-04-18 10:39:35 +020048 case contentTypeTextPlain:
Dusan Kasan17e497e2017-04-10 22:44:22 +020049 message, _ := ioutil.ReadAll(msg.Body)
50 email.TextBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +020051 case contentTypeTextHtml:
Dusan Kasan17e497e2017-04-10 22:44:22 +020052 message, _ := ioutil.ReadAll(msg.Body)
53 email.HTMLBody = strings.TrimSuffix(string(message[:]), "\n")
Dusan Kasanb49ceb62017-04-13 00:00:36 +020054 default:
Dusan Kasan428369f2020-02-24 00:47:31 +010055 email.Content, err = decodeContent(msg.Body, msg.Header.Get("Content-Transfer-Encoding"))
Dusan Kasan17e497e2017-04-10 22:44:22 +020056 }
57
Dusan Kasanb49ceb62017-04-13 00:00:36 +020058 return
59}
60
61func createEmailFromHeader(header mail.Header) (email Email, err error) {
Dusan Kasane668cf22017-04-18 12:56:51 +020062 hp := headerParser{header: &header}
63
Dusan Kasanf4376a62017-05-23 21:03:55 +020064 email.Subject = decodeMimeSentence(header.Get("Subject"))
Dusan Kasane668cf22017-04-18 12:56:51 +020065 email.From = hp.parseAddressList(header.Get("From"))
66 email.Sender = hp.parseAddress(header.Get("Sender"))
67 email.ReplyTo = hp.parseAddressList(header.Get("Reply-To"))
68 email.To = hp.parseAddressList(header.Get("To"))
69 email.Cc = hp.parseAddressList(header.Get("Cc"))
70 email.Bcc = hp.parseAddressList(header.Get("Bcc"))
71 email.Date = hp.parseTime(header.Get("Date"))
72 email.ResentFrom = hp.parseAddressList(header.Get("Resent-From"))
73 email.ResentSender = hp.parseAddress(header.Get("Resent-Sender"))
74 email.ResentTo = hp.parseAddressList(header.Get("Resent-To"))
75 email.ResentCc = hp.parseAddressList(header.Get("Resent-Cc"))
76 email.ResentBcc = hp.parseAddressList(header.Get("Resent-Bcc"))
77 email.ResentMessageID = hp.parseMessageId(header.Get("Resent-Message-ID"))
78 email.MessageID = hp.parseMessageId(header.Get("Message-ID"))
79 email.InReplyTo = hp.parseMessageIdList(header.Get("In-Reply-To"))
80 email.References = hp.parseMessageIdList(header.Get("References"))
81 email.ResentDate = hp.parseTime(header.Get("Resent-Date"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +020082
Dusan Kasane668cf22017-04-18 12:56:51 +020083 if hp.err != nil {
84 err = hp.err
Dusan Kasanb49ceb62017-04-13 00:00:36 +020085 return
86 }
87
Dusan Kasanb49ceb62017-04-13 00:00:36 +020088 //decode whole header for easier access to extra fields
89 //todo: should we decode? aren't only standard fields mime encoded?
90 email.Header, err = decodeHeaderMime(header)
91 if err != nil {
92 return
93 }
94
95 return
96}
97
98func parseContentType(contentTypeHeader string) (contentType string, params map[string]string, err error) {
99 if contentTypeHeader == "" {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200100 contentType = contentTypeTextPlain
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200101 return
102 }
103
104 return mime.ParseMediaType(contentTypeHeader)
105}
106
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400107func parseMultipartRelated(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
108 pmr := multipart.NewReader(msg, boundary)
109 for {
110 part, err := pmr.NextPart()
111
112 if err == io.EOF {
113 break
114 } else if err != nil {
115 return textBody, htmlBody, embeddedFiles, err
116 }
117
118 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
119 if err != nil {
120 return textBody, htmlBody, embeddedFiles, err
121 }
122
123 switch contentType {
124 case contentTypeTextPlain:
125 ppContent, err := ioutil.ReadAll(part)
126 if err != nil {
127 return textBody, htmlBody, embeddedFiles, err
128 }
129
130 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
131 case contentTypeTextHtml:
132 ppContent, err := ioutil.ReadAll(part)
133 if err != nil {
134 return textBody, htmlBody, embeddedFiles, err
135 }
136
137 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
138 case contentTypeMultipartAlternative:
139 tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
140 if err != nil {
141 return textBody, htmlBody, embeddedFiles, err
142 }
143
144 htmlBody += hb
145 textBody += tb
146 embeddedFiles = append(embeddedFiles, ef...)
147 default:
148 if isEmbeddedFile(part) {
149 ef, err := decodeEmbeddedFile(part)
150 if err != nil {
151 return textBody, htmlBody, embeddedFiles, err
152 }
153
154 embeddedFiles = append(embeddedFiles, ef)
155 } else {
156 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/related inner mime type: %s", contentType)
157 }
158 }
159 }
160
161 return textBody, htmlBody, embeddedFiles, err
162}
163
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200164func parseMultipartAlternative(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
165 pmr := multipart.NewReader(msg, boundary)
166 for {
167 part, err := pmr.NextPart()
168
169 if err == io.EOF {
170 break
171 } else if err != nil {
172 return textBody, htmlBody, embeddedFiles, err
173 }
174
175 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
Dusan Kasanc661cc02017-04-18 10:51:51 +0200176 if err != nil {
177 return textBody, htmlBody, embeddedFiles, err
178 }
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200179
180 switch contentType {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200181 case contentTypeTextPlain:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200182 ppContent, err := ioutil.ReadAll(part)
183 if err != nil {
184 return textBody, htmlBody, embeddedFiles, err
185 }
186
187 textBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200188 case contentTypeTextHtml:
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200189 ppContent, err := ioutil.ReadAll(part)
190 if err != nil {
191 return textBody, htmlBody, embeddedFiles, err
192 }
193
194 htmlBody += strings.TrimSuffix(string(ppContent[:]), "\n")
Dusan Kasan45ca2642017-04-18 10:39:35 +0200195 case contentTypeMultipartRelated:
Kevin Chen9b9506a2018-05-03 22:17:38 -0400196 tb, hb, ef, err := parseMultipartRelated(part, params["boundary"])
Dusan Kasan1a966482017-04-18 10:45:25 +0200197 if err != nil {
198 return textBody, htmlBody, embeddedFiles, err
199 }
200
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200201 htmlBody += hb
202 textBody += tb
203 embeddedFiles = append(embeddedFiles, ef...)
204 default:
205 if isEmbeddedFile(part) {
206 ef, err := decodeEmbeddedFile(part)
207 if err != nil {
208 return textBody, htmlBody, embeddedFiles, err
209 }
210
211 embeddedFiles = append(embeddedFiles, ef)
212 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200213 return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/alternative inner mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200214 }
215 }
216 }
217
218 return textBody, htmlBody, embeddedFiles, err
219}
220
221func parseMultipartMixed(msg io.Reader, boundary string) (textBody, htmlBody string, attachments []Attachment, embeddedFiles []EmbeddedFile, err error) {
222 mr := multipart.NewReader(msg, boundary)
223 for {
224 part, err := mr.NextPart()
225 if err == io.EOF {
226 break
227 } else if err != nil {
228 return textBody, htmlBody, attachments, embeddedFiles, err
229 }
230
231 contentType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
232 if err != nil {
233 return textBody, htmlBody, attachments, embeddedFiles, err
234 }
235
Dusan Kasan45ca2642017-04-18 10:39:35 +0200236 if contentType == contentTypeMultipartAlternative {
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200237 textBody, htmlBody, embeddedFiles, err = parseMultipartAlternative(part, params["boundary"])
238 if err != nil {
239 return textBody, htmlBody, attachments, embeddedFiles, err
240 }
Kevin Chen5dc5bc82018-05-03 22:07:35 -0400241 } else if contentType == contentTypeMultipartRelated {
242 textBody, htmlBody, embeddedFiles, err = parseMultipartRelated(part, params["boundary"])
243 if err != nil {
244 return textBody, htmlBody, attachments, embeddedFiles, err
245 }
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200246 } else if isAttachment(part) {
247 at, err := decodeAttachment(part)
248 if err != nil {
249 return textBody, htmlBody, attachments, embeddedFiles, err
250 }
251
252 attachments = append(attachments, at)
253 } else {
Dusan Kasan45ca2642017-04-18 10:39:35 +0200254 return textBody, htmlBody, attachments, embeddedFiles, fmt.Errorf("Unknown multipart/mixed nested mime type: %s", contentType)
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200255 }
256 }
257
258 return textBody, htmlBody, attachments, embeddedFiles, err
Dusan Kasan17e497e2017-04-10 22:44:22 +0200259}
260
// decodeMimeSentence decodes the RFC 2047 encoded-words contained in s and
// returns the resulting text.
//
// s is split on single spaces and each token is passed to
// mime.WordDecoder.Decode. Tokens that fail to decode are kept verbatim.
// The separating space is dropped only between two adjacent encoded-words;
// in every other position (plain/plain, plain/encoded, encoded/plain) it is
// preserved, so "Hello =?UTF-8?q?W=C3=B6rld?=" becomes "Hello Wörld".
func decodeMimeSentence(s string) string {
	var dec mime.WordDecoder

	words := strings.Split(s, " ")
	parts := make([]string, 0, 2*len(words))
	prevEncoded := false

	for i, word := range words {
		decoded, err := dec.Decode(word)
		encoded := err == nil
		if !encoded {
			// Not an encoded-word (or undecodable): keep the raw token.
			decoded = word
		}

		if i > 0 && !(prevEncoded && encoded) {
			parts = append(parts, " ")
		}

		parts = append(parts, decoded)
		prevEncoded = encoded
	}

	return strings.Join(parts, "")
}
281
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200282func decodeHeaderMime(header mail.Header) (mail.Header, error) {
Dusan Kasan17e497e2017-04-10 22:44:22 +0200283 parsedHeader := map[string][]string{}
284
285 for headerName, headerData := range header {
286
287 parsedHeaderData := []string{}
288 for _, headerValue := range headerData {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200289 parsedHeaderData = append(parsedHeaderData, decodeMimeSentence(headerValue))
Dusan Kasan17e497e2017-04-10 22:44:22 +0200290 }
291
292 parsedHeader[headerName] = parsedHeaderData
293 }
294
295 return mail.Header(parsedHeader), nil
296}
297
// isEmbeddedFile reports whether part should be treated as an embedded file,
// which is the case when it has a non-empty Content-Transfer-Encoding header.
func isEmbeddedFile(part *multipart.Part) bool {
	encoding := part.Header.Get("Content-Transfer-Encoding")

	return len(encoding) > 0
}
301
302func decodeEmbeddedFile(part *multipart.Part) (ef EmbeddedFile, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200303 cid := decodeMimeSentence(part.Header.Get("Content-Id"))
Dusan Kasan428369f2020-02-24 00:47:31 +0100304 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200305 if err != nil {
306 return
307 }
308
309 ef.CID = strings.Trim(cid, "<>")
310 ef.Data = decoded
311 ef.ContentType = part.Header.Get("Content-Type")
312
313 return
314}
315
// isAttachment reports whether part is an attachment, i.e. whether
// part.FileName() returns a non-empty name.
func isAttachment(part *multipart.Part) bool {
	name := part.FileName()

	return len(name) > 0
}
319
320func decodeAttachment(part *multipart.Part) (at Attachment, err error) {
Dusan Kasanf4376a62017-05-23 21:03:55 +0200321 filename := decodeMimeSentence(part.FileName())
Dusan Kasan428369f2020-02-24 00:47:31 +0100322 decoded, err := decodeContent(part, part.Header.Get("Content-Transfer-Encoding"))
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200323 if err != nil {
324 return
325 }
326
327 at.Filename = filename
328 at.Data = decoded
329 at.ContentType = strings.Split(part.Header.Get("Content-Type"), ";")[0]
330
331 return
332}
333
// decodeContent returns a reader over content with the given
// Content-Transfer-Encoding removed.
//
// The encoding name is matched case-insensitively, ignoring surrounding
// whitespace:
//
//   - "base64" and "quoted-printable" content is decoded completely into
//     memory, so decoding errors are reported here rather than on a later
//     read;
//   - "", "7bit", "8bit" and "binary" are identity encodings, for which
//     content itself is returned unread;
//   - any other value yields an "unknown encoding" error and a nil reader.
func decodeContent(content io.Reader, encoding string) (io.Reader, error) {
	var decoder io.Reader

	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "base64":
		decoder = base64.NewDecoder(base64.StdEncoding, content)
	case "quoted-printable":
		decoder = quotedprintable.NewReader(content)
	case "", "7bit", "8bit", "binary":
		return content, nil
	default:
		return nil, fmt.Errorf("unknown encoding: %s", encoding)
	}

	b, err := ioutil.ReadAll(decoder)
	if err != nil {
		return nil, err
	}

	return bytes.NewReader(b), nil
}
350
// headerParser parses individual header values and remembers the first error
// it encounters in err. Once err is set, every further parse call returns a
// zero value without doing any work, so a caller can run a series of parses
// and check err once at the end.
//
// All methods use pointer receivers: they must update err on the caller's
// value, which a value receiver would silently fail to do.
type headerParser struct {
	header *mail.Header
	err    error
}

// parseAddress parses s as a single mail address. It returns nil when s
// contains only spaces and newlines, when an earlier error is recorded, or
// when parsing fails (in which case the error is recorded).
func (hp *headerParser) parseAddress(s string) *mail.Address {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	var ma *mail.Address
	ma, hp.err = mail.ParseAddress(s)

	return ma
}

// parseAddressList parses s as a list of mail addresses. It returns nil when
// s contains only spaces and newlines or when an earlier error is recorded;
// a parse failure is recorded in hp.err.
func (hp *headerParser) parseAddressList(s string) []*mail.Address {
	if hp.err != nil || strings.Trim(s, " \n") == "" {
		return nil
	}

	var ma []*mail.Address
	ma, hp.err = mail.ParseAddressList(s)

	return ma
}

// parseTime parses s as a date, trying time.RFC1123Z first and then the same
// layout with a day of month that is not zero-padded. It returns the zero
// time when s is empty or an earlier error is recorded. If both layouts fail,
// the error of the second attempt is recorded.
func (hp *headerParser) parseTime(s string) (t time.Time) {
	if hp.err != nil || s == "" {
		return t
	}

	t, hp.err = time.Parse(time.RFC1123Z, s)
	if hp.err == nil {
		return t
	}

	// Fallback for single-digit days; overwrites the error of the first try.
	t, hp.err = time.Parse("Mon, 2 Jan 2006 15:04:05 -0700", s)

	return t
}

// parseMessageId strips surrounding "<", ">" and spaces from s. It returns ""
// when an earlier error is recorded.
func (hp *headerParser) parseMessageId(s string) string {
	if hp.err != nil {
		return ""
	}

	return strings.Trim(s, "<> ")
}

// parseMessageIdList splits s on whitespace and returns the message id of
// each non-empty token. It returns nil when s holds no ids or an earlier
// error is recorded.
func (hp *headerParser) parseMessageIdList(s string) (result []string) {
	if hp.err != nil {
		return nil
	}

	// strings.Fields never yields empty tokens and also splits on tabs and
	// newlines, so ids never carry stray whitespace.
	for _, p := range strings.Fields(s) {
		result = append(result, hp.parseMessageId(p))
	}

	return result
}
419
// Attachment is a file attached to an email.
type Attachment struct {
	// Filename is the MIME-decoded file name of the attachment.
	Filename string
	// ContentType is the media type of the attachment, without parameters.
	ContentType string
	// Data gives access to the decoded attachment content.
	Data io.Reader
}
426
// EmbeddedFile is a file embedded in an email body and addressed by its
// content id.
type EmbeddedFile struct {
	// CID is the content id of the file, without the enclosing "<" and ">".
	CID string
	// ContentType is the Content-Type header value of the part.
	ContentType string
	// Data gives access to the decoded file content.
	Data io.Reader
}
433
Dusan Kasan1a966482017-04-18 10:45:25 +0200434// Email with fields for all the headers defined in RFC5322 with it's attachments and
Dusan Kasan17e497e2017-04-10 22:44:22 +0200435type Email struct {
436 Header mail.Header
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200437
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200438 Subject string
439 Sender *mail.Address
440 From []*mail.Address
441 ReplyTo []*mail.Address
442 To []*mail.Address
443 Cc []*mail.Address
444 Bcc []*mail.Address
445 Date time.Time
446 MessageID string
447 InReplyTo []string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200448 References []string
449
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200450 ResentFrom []*mail.Address
451 ResentSender *mail.Address
452 ResentTo []*mail.Address
453 ResentDate time.Time
454 ResentCc []*mail.Address
455 ResentBcc []*mail.Address
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200456 ResentMessageID string
457
Dusan Kasan428369f2020-02-24 00:47:31 +0100458 ContentType string
459 Content io.Reader
460
Dusan Kasan17e497e2017-04-10 22:44:22 +0200461 HTMLBody string
462 TextBody string
Dusan Kasanb49ceb62017-04-13 00:00:36 +0200463
Dusan Kasan4595dfe2017-04-13 00:38:24 +0200464 Attachments []Attachment
Dusan Kasan17e497e2017-04-10 22:44:22 +0200465 EmbeddedFiles []EmbeddedFile
Dusan Kasan428369f2020-02-24 00:47:31 +0100466}