blob: e14075c7c21fda271d3d46d32d316b2be6a80b5d [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2016 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""Functions that format or parse email messages in Monorail.
6
7Specifically, this module has the logic for generating various email
8header lines that help match inbound and outbound email to the project
9and artifact that generated it.
10"""
11from __future__ import print_function
12from __future__ import division
13from __future__ import absolute_import
14
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010015import hashlib
Copybara854996b2021-09-07 19:36:02 +000016import hmac
17import logging
18import re
Copybara854996b2021-09-07 19:36:02 +000019
20import six
21
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010022if six.PY2:
23 import rfc822
24else:
25 import email.utils
26
Copybara854996b2021-09-07 19:36:02 +000027from google.appengine.api import app_identity
28
29import settings
30from framework import framework_constants
31from services import client_config_svc
32from services import secrets_svc
33
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# Largest len(body) that IsBodyTooBigToParse() still accepts (100 * 1024).
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() keeps at most this many leading characters of a
# normalized header value and drops the rest.
MAX_HEADER_CHARS_CONSIDERED = 255
39
40
41def _checkEmailHeaderPrefix(key):
42 """Ensures that a given email header starts with X-Alert2Monorail prefix."""
43 # this is to catch typos in the email header prefix and raises an exception
44 # during package loading time.
45 assert key.startswith('X-Alert2Monorail')
46 return key
47
48
class AlertEmailHeader(object):
  """A list of the email header keys supported by Alert2Monorail.

  Each attribute holds the literal header name to look up on an inbound
  message.
  """
  # pylint: disable=bad-whitespace
  #
  # The prefix has been hard-coded without string substitution to make them
  # searchable with the header keys.
  #
  # INCIDENT_ID is the only key that is not routed through
  # _checkEmailHeaderPrefix(): its name does not carry the X-Alert2Monorail
  # prefix.
  INCIDENT_ID = 'X-Incident-Id'
  # Every key below is validated by _checkEmailHeaderPrefix() when this class
  # body is executed, so a typo in the prefix raises at load time.
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
64
65
def IsBodyTooBigToParse(body):
  """Tell whether an email body exceeds the size we are willing to parse.

  Args:
    body: the message body; only its len() is examined.

  Returns:
    True when len(body) is strictly greater than MAX_BODY_SIZE.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
69
70
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Report whether the project's address is among the To: addresses.

  Args:
    project_addr: the address to look for.
    to_addrs: container of addresses taken from the To: line.

  Returns:
    True if `project_addr` is found in `to_addrs`, i.e. the email was
    explicitly sent directly to us; False otherwise.
  """
  sent_directly_to_us = project_addr in to_addrs
  return sent_directly_to_us
74
75
def ParseEmailMessage(msg):
  """Parse the given MessageRouterMessage and return relevant fields.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references,
    incident_id, subject, body.

    from_addr is the first address on the From: line, or '' if there is none.
    references is a list holding the In-Reply-To value followed by the
    References values, with empty entries and duplicates removed and header
    order kept.  subject has any 'Re:'-style prefixes stripped.  body is the
    text of the first text/plain part, or u'' if there is no such part.

    If the Precedence header is 'bulk' or 'junk' the message is ignored and
    ('', [], [], '', '', '', '') is returned.
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ('bulk', 'junk'):
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', '', ''

  from_addrs = _ExtractAddrs(msg.get('from', ''))
  from_addr = from_addrs[0] if from_addrs else ''

  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  in_reply_to = msg.get('in-reply-to', '')
  incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')
  # De-duplicate without going through a set so that the result order is
  # deterministic (In-Reply-To first, then References in header order).
  references = []
  for ref in [in_reply_to] + msg.get('references', '').split():
    if ref and ref not in references:
      references.append(ref)
  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = u''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() != 'text/plain':
      continue
    # get_payload(decode=True) may return None; treat that as an empty body.
    payload = part.get_payload(decode=True) or b''
    if isinstance(payload, six.text_type):
      body = payload
    else:
      # Honor the charset the sender declared and fall back to UTF-8.  The
      # message is untrusted input, so an unknown charset name or bytes that
      # do not decode must not raise: substitute U+FFFD instead.
      charset = part.get_content_charset() or 'utf-8'
      try:
        body = payload.decode(charset)
      except (LookupError, UnicodeDecodeError):
        body = payload.decode('utf-8', 'replace')
    break  # Only consider the first text part.

  return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
          body)
119
120
def _ExtractAddrs(header_value):
  """Return the bare email addresses found in a message header value.

  Args:
    header_value: string value of an address header such as From:, To: or Cc:.

  Returns:
    A list with the address part of every (friendly name, address) pair that
    the platform's address parser finds; friendly names are discarded.
  """
  if not six.PY2:
    name_and_addr_pairs = email.utils.getaddresses([header_value])
  else:
    name_and_addr_pairs = list(rfc822.AddressList(header_value))
  return [pair[1] for pair in name_and_addr_pairs]
128
129
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. prefix from a subject line.

  Prefixes are peeled off one at a time, and the remainder is stripped of
  surrounding whitespace after each removal, until none is left.

  Args:
    subject: string subject line.

  Returns:
    The subject without any leading reply/forward prefixes.  A subject that
    has no such prefix is returned unchanged.
  """
  while True:
    found = _FindSubjectPrefix(subject)
    if not found:
      return subject
    subject = subject[len(found):].strip()
138
139
140def _FindSubjectPrefix(subject):
141 """If the given subject starts with a prefix, return that prefix."""
142 for prefix in ['re:', 'aw:', 'fwd:', 'fw:']:
143 if subject.lower().startswith(prefix):
144 return prefix
145
146 return None
147
148
def MailDomain():
  """Return the domain name where this app can receive email.

  Returns:
    'testbed-test.appspotmail.com' when settings.unit_test_mode is set;
    otherwise settings.mail_domain when that is set; otherwise
    '<app id>.appspotmail.com', where the app id is whatever follows the last
    ':' in the id reported by app_identity.
  """
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  if settings.mail_domain:
    return settings.mail_domain

  full_app_id = app_identity.get_application_id()
  # rpartition() yields the whole id when it contains no ':'.
  bare_app_id = full_app_id.rpartition(':')[2]
  return '%s.appspotmail.com' % bare_app_id
165
166
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Build a From: line value that shows the commenter's friendly name.

  Args:
    commenter_view: view of the user who made the comment, or a false value
      when there is no commenter.
    sender: string address that the email is sent from.
    reveal_addr: bool.  If true the commenter's email is used as the friendly
      name, otherwise an obscured form of it is used.

  Returns:
    `sender` unchanged when there is no commenter.  Otherwise a string of the
    form '<friendly> via <site name> <<sender>>'.  When `sender` contains an
    '@', '+v2.<user_id>' is appended to its user name and the friendly name
    is cut at its first '@'.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name.lower()
  if commenter_view.email in client_config_svc.GetServiceAccountMap():
    # Service accounts are shown by display name.
    friendly = commenter_view.display_name
  elif reveal_addr:
    friendly = commenter_view.email
  else:
    friendly = u'%s\u2026@%s' % (
        commenter_view.obscured_username, commenter_view.domain)

  if '@' in sender:
    sender_username, _, sender_domain = sender.partition('@')
    sender = '%s+v2.%d@%s' % (
        sender_username, commenter_view.user_id, sender_domain)
    friendly = friendly.partition('@')[0]

  return '%s via %s <%s>' % (friendly, site_name, sender)
186
187
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Return a From:-style address whose inbound messages are ignored.

  Args:
    commenter_view: optional view of the commenting user, passed on to
      FormatFriendly().
    reveal_addr: optional bool, passed on to FormatFriendly().

  Returns:
    'no_reply@<mail domain>' formatted by FormatFriendly().
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  # Note: This does not take branded domains into account, but this address
  # is only used for email error messages and in the reply-to address
  # when the user is not allowed to reply.
  return FormatFriendly(
      commenter_view, 'no_reply@%s' % MailDomain(), reveal_addr)
197
198
def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    project: Project PB for the project that the email is sent from.
    commenter_view: Optional UserView of the user who made a comment.  We use
      the user's (potentially obscured) email address as their friendly name.
    reveal_addr: Optional bool.  If False then the address is obscured.
    can_reply_to: Optional bool.  If True then settings.send_email_as_format
      is used, otherwise settings.send_noreply_email_as_format is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    addr_format = settings.send_email_as_format
  else:
    addr_format = settings.send_noreply_email_as_format

  # Prefer the project's own branded domain, then the '*' entry, and finally
  # fall back to 'chromium.org'.
  branded = settings.branded_domains.get(
      project.project_name, settings.branded_domains.get('*'))
  domain = branded or 'chromium.org'
  # Keep only the last two dot-separated labels.  A domain with at most one
  # dot comes through this unchanged.
  domain = '.'.join(domain.split('.')[-2:])

  return FormatFriendly(
      commenter_view, addr_format % {'domain': domain}, reveal_addr)
224
225
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Args:
    s: string header value, e.g. a subject line.

  Returns:
    The value with reply/forward prefixes stripped, every run of whitespace
    collapsed to a single space, and cut to at most
    MAX_HEADER_CHARS_CONSIDERED characters.
  """
  without_prefixes = _StripSubjectPrefixes(s)
  # str.split() with no argument splits on any run of whitespace.
  single_spaced = ' '.join(without_prefixes.split())
  return single_spaced[:MAX_HEADER_CHARS_CONSIDERED]
232
233
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: address that the email message will be sent to.
    subject: subject line of the email message; it is normalized with
      NormalizeHeader() before being hashed.
    from_addr: address that the message will be sent from; only the part
      before the first '@' is used.

  Returns:
    A string of the form '<0=TO_HASH=SUBJECT_HASH=FROM_USER@MAIL_DOMAIN>'
    where both hashes are hex HMAC-MD5 digests keyed with
    secrets_svc.GetEmailKey().
  """
  subject_to_hash = NormalizeHeader(subject)
  if isinstance(subject_to_hash, six.text_type):
    subject_to_hash = subject_to_hash.encode('utf-8')
  mail_hmac_key = secrets_svc.GetEmailKey()

  def _KeyedHexDigest(value):
    """Return the hex HMAC-MD5 digest of `value` under the mail key."""
    return hmac.new(
        mail_hmac_key, six.ensure_binary(value),
        digestmod=hashlib.md5).hexdigest()

  return '<0=%s=%s=%s@%s>' % (
      _KeyedHexDigest(to_addr),
      _KeyedHexDigest(subject_to_hash),
      from_addr.split('@')[0],
      MailDomain())
Copybara854996b2021-09-07 19:36:02 +0000249
250
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header to make this message thread properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
      or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  The empty string is returned
    when seq_num is None.
  """
  if seq_num is None:
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
270
271
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.  A message_ref that
    is not a string, or that cannot be encoded for comparison, yields False.
  """
  sender = '%s@%s' % (project.project_name, MailDomain())
  expected_ref = MakeMessageID(from_addr, subject, sender)

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.

  # message_ref comes from untrusted inbound mail and expected_ref embeds
  # HMAC digests, so compare in constant time rather than with `==`.
  # compare_digest() needs two values of the same kind, hence both sides are
  # converted to bytes first.
  try:
    expected_bytes = six.ensure_binary(expected_ref)
    actual_bytes = six.ensure_binary(message_ref)
  except (TypeError, UnicodeError):
    # Not a string at all, or text that cannot be encoded: no match.
    return False
  return hmac.compare_digest(expected_bytes, actual_bytes)
291
292
# Matches an inbound project address of the form
# project[+verb[+label]]@domain and exposes the named groups `project`,
# `verb`, `label` and `domain`.  Only lowercase letters are accepted;
# IdentifyProjectVerbAndLabel() lowercases the address before matching.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches a subject of the form "Issue <local_id> in <project>: <summary>".
ISSUE_CHANGE_SUBJECT_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')

# Matches the compact subject form "<project>:<local_id>: <summary>".
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
    r'(?P<project>[-a-z0-9]+):'
    r'(?P<local_id>[0-9]+): '
    r'(?P<summary>.+)')
307
308
def IdentifyIssue(project_name, subject):
  """Parse the artifact id from a reply and verify it is a valid issue.

  Args:
    project_name: string the project to search for the issue in.
    subject: string email subject line received, it must match the one
      sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    An int local_id for the id of the issue.  None if the subject names a
    different project, if no id is found, or if the id is not valid.
  """
  issue_project_name, local_id_str = _MatchSubject(subject)

  # The subject must name the project that the email was sent to.
  if project_name != issue_project_name:
    return None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  try:
    return int(local_id_str)
  except (ValueError, TypeError):
    return None
337
338
def IdentifyProjectVerbAndLabel(project_addr):
  """Split an inbound project address into project name, verb and label.

  Args:
    project_addr: string email address that the message was sent to.

  Returns:
    A tuple (project_name, verb, label) taken from the lowercased address as
    matched by PROJECT_EMAIL_RE.  verb and label are None when the address
    does not carry them.  (None, None, None) is returned for addresses that
    start with 'no_reply@' and for addresses that do not match.
  """
  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return None, None, None

  match = PROJECT_EMAIL_RE.match(project_addr.lower())
  if not match:
    return None, None, None

  return match.group('project'), match.group('verb'), match.group('label')
354
355
def _MatchSubject(subject):
  """Parse the project name and artifact id from a subject line.

  The long subject form is tried first, then the compact form.

  Args:
    subject: string subject line with reply/forward prefixes already removed.

  Returns:
    A tuple (project, local_id) of strings taken from the first form that
    matches, or (None, None) if neither form matches.
  """
  subject_forms = (ISSUE_CHANGE_SUBJECT_RE, ISSUE_CHANGE_COMPACT_SUBJECT_RE)
  for subject_re in subject_forms:
    match = subject_re.match(subject)
    if match:
      return match.group('project'), match.group('local_id')

  return None, None
364
365
366# TODO(jrobbins): For now, we strip out lines that look like quoted
367# text and then will give the user the option to see the whole email.
368# For 2.0 of this feature, we should change the Comment PB to have
369# runs of text with different properties so that the UI can present
370# "- Show quoted text -" and expand it in-line.
371
372# TODO(jrobbins): For now, we look for lines that indicate quoted
373# text (e.g., they start with ">"). But, we should also collapse
374# multiple lines that are identical to other lines in previous
375# non-deleted comments on the same issue, regardless of quote markers.
376
377
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
#
# The separator alternative is written as "run( run)* ?" instead of
# "(run ?)+".  Both accept the same lines, but the nested quantifier form
# backtracks exponentially on a long run of separator characters followed by
# other text (e.g. '-----...----- snip'), and this is applied to untrusted
# email bodies.
SIGNATURE_BOUNDARY_RE = re.compile(
    r'^(([-_=]+(?: [-_=]+)* ?)|'
    r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
    r'Sent from my i?Phone|Sent from my iPod)'
    r',? *$', re.I)

# Only this many lines at the bottom of a message are searched for a
# signature boundary.
MAX_SIGNATURE_LINES = 8

FORWARD_OR_EXPLICIT_SIG_PATS = [
    r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
    r'Updates:\s*$',
    r'Comment #\d+ on issue \d+ by \S+:',
    # If we see this anywhere in the message, treat the rest as a signature.
    r'--\s*$',
    ]
# Matches from the start of any line that looks like one of the patterns
# above through the end of the text.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
    r'^(%s)(.|\n)*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
    flags=re.MULTILINE | re.IGNORECASE)

# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    # The dots of ".appspotmail.com" are escaped so that they only match a
    # literal '.'.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s.*wrote:\s*$',
    # French "a ecrit" attribution with an accented e.  \xe9 matches the
    # character in decoded (unicode) text; \xc3\xa9 is kept so that text still
    # holding the raw UTF-8 bytes of that character matches as before.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s+.*a\s+'
    r'(?:\xe9|\xc3\xa9)crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    r'^>.*$',
    ]
# A quoted block is one or more quote lines together with the blank lines
# around it.  Each blank line is matched as "[^\S\n]*\n" (whitespace other
# than a newline, then the newline) so that one repetition consumes exactly
# one line.  Using "\s*\n" here lets a single repetition swallow several
# lines, which makes the surrounding "(...)*" backtrack exponentially on a
# long run of blank lines.
QUOTED_BLOCKS_RE = re.compile(
    r'(^[^\S\n]*\n)*((%s)\n?)+(^[^\S\n]*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
413
414
def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of the comment as received.

  Returns:
    The text with forwarded or explicitly marked trailing content removed,
    quoted blocks collapsed, and a trailing signature cut off, stripped of
    surrounding whitespace.
  """
  # If the rest of message is forwarded text, we're done.
  text = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub('', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  text = QUOTED_BLOCKS_RE.sub('\n', text)

  kept_lines = text.strip().split('\n')
  # Make another pass over the last few lines to strip out signatures.
  zone_start = max(0, len(kept_lines) - MAX_SIGNATURE_LINES)
  for offset, line in enumerate(kept_lines[zone_start:]):
    if SIGNATURE_BOUNDARY_RE.match(line):
      # We found the likely start of a signature, just keep the lines above.
      kept_lines = kept_lines[:zone_start + offset]
      break

  return '\n'.join(kept_lines).strip()
433 return '\n'.join(new_lines).strip()