blob: 2933feac6bbfe26afa4264a775081ab522e7cfb7 [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001# Copyright 2016 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style
3# license that can be found in the LICENSE file or at
4# https://developers.google.com/open-source/licenses/bsd
5
6"""Functions that format or parse email messages in Monorail.
7
8Specifically, this module has the logic for generating various email
9header lines that help match inbound and outbound email to the project
10and artifact that generated it.
11"""
12from __future__ import print_function
13from __future__ import division
14from __future__ import absolute_import
15
16import hmac
17import logging
18import re
19import rfc822
20
21import six
22
23from google.appengine.api import app_identity
24
25import settings
26from framework import framework_constants
27from services import client_config_svc
28from services import secrets_svc
29
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# IsBodyTooBigToParse() reports any body whose len() exceeds this limit.
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() keeps at most this many characters of a header value.
MAX_HEADER_CHARS_CONSIDERED = 255
35
36
def _checkEmailHeaderPrefix(key):
  """Return key unchanged after verifying its X-Alert2Monorail prefix.

  This exists to catch typos in the hard-coded header names at the time
  they are defined.

  Args:
    key: string email header name to check.

  Returns:
    The same key that was passed in.

  Raises:
    AssertionError: if key does not start with 'X-Alert2Monorail'.
  """
  # An explicit raise is used instead of an assert statement so that the
  # check still runs when Python is started with -O, which strips asserts.
  # AssertionError is kept so the failure type is the same as before.
  if not key.startswith('X-Alert2Monorail'):
    raise AssertionError(
        'Email header %r does not start with X-Alert2Monorail' % (key,))
  return key
43
44
class AlertEmailHeader(object):
  """Names of the email headers that Alert2Monorail understands.

  Every X-Alert2Monorail-* name is spelled out in full, rather than being
  built from a shared prefix constant, so that searching the code for a
  header name finds its definition here.  _checkEmailHeaderPrefix() verifies
  the prefix of each of those names when this class body is executed.
  """
  INCIDENT_ID = 'X-Incident-Id'
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
60
61
def IsBodyTooBigToParse(body):
  """Tell whether an email body exceeds the size we are willing to parse.

  Args:
    body: the email message body; only its len() is examined.

  Returns:
    True if len(body) is greater than MAX_BODY_SIZE, otherwise False.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
65
66
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Tell whether the project address is among the To: addresses.

  Args:
    project_addr: string email address of the project.
    to_addrs: collection of addresses taken from the To: line.

  Returns:
    True if project_addr is contained in to_addrs, meaning that the email
    was explicitly sent directly to us.  Otherwise, False.
  """
  sent_directly_to_us = project_addr in to_addrs
  return sent_directly_to_us
70
71
def ParseEmailMessage(msg):
  """Parse the given inbound email message and return relevant fields.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references,
    incident_id, subject, body.  references is a list holding the
    In-Reply-To value followed by the References values, with empty and
    repeated entries removed and first-seen order kept.  subject has
    prefixes such as "Re:" stripped.  body is the decoded text of the first
    text/plain part, or u'' if there is none.  If the Precedence header
    marks the message as bulk or junk, every field is empty:
    ('', [], [], '', '', '', '').
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ['bulk', 'junk']:
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', '', ''

  from_addrs = _ExtractAddrs(msg.get('from', ''))
  from_addr = from_addrs[0] if from_addrs else ''

  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  in_reply_to = msg.get('in-reply-to', '')
  incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')
  # De-duplicate with a list rather than a set so that the resulting order
  # is deterministic instead of depending on set iteration order.
  references = []
  for ref in [in_reply_to] + msg.get('references', '').split():
    if ref and ref not in references:
      references.append(ref)
  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = u''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() == 'text/plain':
      body = part.get_payload(decode=True)
      if not isinstance(body, six.text_type):
        # Honor the charset declared on the part; many mail clients do not
        # send UTF-8.  If the charset is missing, unknown, or does not
        # actually fit the bytes, fall back to UTF-8 and substitute
        # undecodable bytes rather than failing the whole message.
        charset = part.get_content_charset() or 'utf-8'
        try:
          body = body.decode(charset)
        except (LookupError, UnicodeDecodeError):
          body = body.decode('utf-8', 'replace')
      break  # Only consider the first text part.

  return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
          body)
115
116
def _ExtractAddrs(header_value):
  """Given a message header value, return the email addresses found there.

  Args:
    header_value: string value of an address header such as From:, To:,
        or Cc:.  It may hold several comma-separated addresses, each with
        or without a display name.

  Returns:
    A list of the address strings in the order the parser returned them,
    without display names.  Entries for which the parser produced no
    address are left out, so an empty header value gives an empty list.
  """
  # The rfc822 module has been deprecated since Python 2.3 and does not
  # exist in Python 3.  email.utils offers the same address parsing on both.
  # It is imported here so this function does not rely on the file's
  # top-level import block.
  from email import utils as email_utils

  friendly_addr_pairs = email_utils.getaddresses([header_value])
  # getaddresses() can report an unparsable entry as ('', ''); skip those.
  return [addr for _friendly, addr in friendly_addr_pairs if addr]
121
122
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. prefix from a subject line.

  Args:
    subject: string subject line.

  Returns:
    The subject with all leading reply/forward prefixes removed.  The text
    is whitespace-stripped after each removal; a subject that has no prefix
    is returned unchanged.
  """
  while True:
    found_prefix = _FindSubjectPrefix(subject)
    if not found_prefix:
      return subject
    # Only the length of the prefix matters; the subject's own casing of it
    # is simply cut off.
    subject = subject[len(found_prefix):].strip()
131
132
def _FindSubjectPrefix(subject):
  """Return the reply/forward prefix that the subject starts with, if any.

  Args:
    subject: string subject line.

  Returns:
    The lowercase prefix ('re:', 'aw:', 'fwd:', or 'fw:') that the subject
    begins with, compared case-insensitively, or None if it has none.
  """
  lowered_subject = subject.lower()
  return next(
      (candidate for candidate in ('re:', 'aw:', 'fwd:', 'fw:')
       if lowered_subject.startswith(candidate)),
      None)
140
141
def MailDomain():
  """Return the domain name where this app can receive email.

  Returns:
    A fixed testbed domain when settings.unit_test_mode is set, else
    settings.mail_domain when that is set, else '<app_id>.appspotmail.com'
    built from the application ID.
  """
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  if settings.mail_domain:
    return settings.mail_domain

  # Keep only what follows the last ':' in the application ID, or all of it
  # when the ID contains no ':'.
  app_id = app_identity.get_application_id().rpartition(':')[2]
  return '%s.appspotmail.com' % app_id
158
159
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Format the From: line to include the commenter's friendly name if given.

  Args:
    commenter_view: UserView of the user who made a comment, or a false
        value if there is no commenter to show.
    sender: string email address that the message is sent from.
    reveal_addr: bool. If False, the commenter's address is obscured unless
        it is listed in the service account map.

  Returns:
    sender unchanged when there is no commenter_view.  Otherwise a string of
    the form 'FRIENDLY via SITE <SENDER>'.  When sender contains '@', the
    commenter's user ID is appended to its local part as '+v2.<id>' and the
    friendly name is cut off at its first '@'.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name.lower()
  if commenter_view.email in client_config_svc.GetServiceAccountMap():
    friendly = commenter_view.display_name
  elif reveal_addr:
    friendly = commenter_view.email
  else:
    friendly = u'%s\u2026@%s' % (
        commenter_view.obscured_username, commenter_view.domain)

  sender_username, at_sign, sender_domain = sender.partition('@')
  if at_sign:
    sender = '%s+v2.%d@%s' % (
        sender_username, commenter_view.user_id, sender_domain)
    friendly = friendly.split('@')[0]
  return '%s via %s <%s>' % (friendly, site_name, sender)
179
180
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Return an address that ignores all messages sent to it.

  Args:
    commenter_view: Optional UserView of the user who made a comment.
    reveal_addr: Optional bool. If False then the address is obscured.

  Returns:
    The 'no_reply@<mail domain>' address, passed through FormatFriendly()
    together with commenter_view and reveal_addr.
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  # Note: This does not take branded domains into account, but this address
  # is only used for email error messages and in the reply-to address
  # when the user is not allowed to reply.
  no_reply_addr = 'no_reply@%s' % MailDomain()
  return FormatFriendly(commenter_view, no_reply_addr, reveal_addr)
190
191
def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    project: Project PB for the project that the email is sent from.
    commenter_view: Optional UserView of the user who made a comment.  We use
        the user's (potentially obscured) email address as their friendly
        name.
    reveal_addr: Optional bool. If False then the address is obscured.
    can_reply_to: Optional bool. If True then settings.send_email_as_format
        is used, otherwise settings.send_noreply_email_as_format is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    addr_format = settings.send_email_as_format
  else:
    addr_format = settings.send_noreply_email_as_format

  # Prefer the project's own branded domain, then the '*' entry, and
  # finally fall back to chromium.org.
  wildcard_domain = settings.branded_domains.get('*')
  branded_domain = settings.branded_domains.get(
      project.project_name, wildcard_domain)
  full_domain = branded_domain or 'chromium.org'

  # Keep at most the last two dot-separated labels of the domain.
  domain = '.'.join(full_domain.split('.')[-2:])

  addr = addr_format % {'domain': domain}
  return FormatFriendly(commenter_view, addr, reveal_addr)
217
218
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Args:
    s: string header value, e.g., a subject line.

  Returns:
    The value with leading 'Re:'-style prefixes removed, every run of
    whitespace collapsed to a single space, and the result cut off at
    MAX_HEADER_CHARS_CONSIDERED characters.
  """
  without_prefixes = _StripSubjectPrefixes(s)
  # split() with no argument splits on any run of whitespace.
  single_spaced = ' '.join(without_prefixes.split())
  return single_spaced[:MAX_HEADER_CHARS_CONSIDERED]
225
226
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: string address that the email message will be sent to.
    subject: string subject line of the email message.
    from_addr: string address that the message will be sent from; only the
        part before the first '@' is used.

  Returns:
    A string of the form '<0=HMAC(to_addr)=HMAC(subject)=LOCALPART@DOMAIN>',
    where the subject is normalized with NormalizeHeader() first and DOMAIN
    comes from MailDomain().
  """
  # Imported here so this function does not rely on the file's top-level
  # import block.
  import hashlib

  normalized_subject = NormalizeHeader(subject)
  if isinstance(normalized_subject, six.text_type):
    normalized_subject = normalized_subject.encode('utf-8')
  # hmac needs bytes on Python 3, and a non-ASCII text address could not be
  # hashed on Python 2 either, so encode it the same way as the subject.
  if isinstance(to_addr, six.text_type):
    to_addr = to_addr.encode('utf-8')
  mail_hmac_key = secrets_svc.GetEmailKey()

  def _HexDigest(value):
    # MD5 was the implicit default of hmac.new() on Python 2.  It is named
    # explicitly because Python 3.8+ requires digestmod, and it must stay
    # MD5 so that IDs computed now still equal the ones in messages that
    # were already sent.
    return hmac.new(mail_hmac_key, value, digestmod=hashlib.md5).hexdigest()

  return '<0=%s=%s=%s@%s>' % (
      _HexDigest(to_addr),
      _HexDigest(normalized_subject),
      from_addr.split('@')[0],
      MailDomain())
238
239
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header to make this message thread properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
        or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  The empty string is
    returned when seq_num is None.
  """
  if seq_num is None:
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
259
260
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.
  """
  # Rebuild the ID we would have generated for a message from this project
  # to the replying address with this subject.
  project_sender = '%s@%s' % (project.project_name, MailDomain())

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.
  return MakeMessageID(from_addr, subject, project_sender) == message_ref
280
281
# Matches PROJECT@DOMAIN, PROJECT+VERB@DOMAIN, or PROJECT+VERB+LABEL@DOMAIN.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches subject lines of the form "Issue LOCAL_ID in PROJECT: SUMMARY".
ISSUE_CHANGE_SUBJECT_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')

# Matches subject lines of the compact form "PROJECT:LOCAL_ID: SUMMARY".
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
    r'(?P<project>[-a-z0-9]+):'
    r'(?P<local_id>[0-9]+): '
    r'(?P<summary>.+)')
296
297
def IdentifyIssue(project_name, subject):
  """Parse the artifact id from a reply and verify it is a valid issue.

  Args:
    project_name: string the project to search for the issue in.
    subject: string email subject line received, it must match the one
        sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    An int local_id for the id of the issue.  None if no id is found, the id
    is not valid, or the subject names a different project.
  """
  issue_project_name, local_id_str = _MatchSubject(subject)

  # Something is wrong with the project name.
  if project_name != issue_project_name:
    return None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  try:
    return int(local_id_str)
  except (ValueError, TypeError):
    # TypeError covers a local_id_str of None, i.e., nothing was parsed.
    return None
326
327
def IdentifyProjectVerbAndLabel(project_addr):
  """Split a project email address into project name, verb, and label.

  Args:
    project_addr: string email address that an inbound email was sent to.

  Returns:
    A tuple (project_name, verb, label) parsed from the lowercased address
    with PROJECT_EMAIL_RE.  verb and label are None when the address does
    not include them.  All three are None when the address does not match
    or when it starts with 'no_reply@'.
  """
  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return None, None, None

  parsed = PROJECT_EMAIL_RE.match(project_addr.lower())
  if not parsed:
    return None, None, None

  return parsed.group('project'), parsed.group('verb'), parsed.group('label')
343
344
def _MatchSubject(subject):
  """Parse the project name and issue local ID from a subject line.

  Args:
    subject: string subject line, tried first against the long
        "Issue N in PROJECT: ..." form and then the compact
        "PROJECT:N: ..." form.

  Returns:
    A tuple (project_name, local_id_str) of strings taken from the first
    form that matches, or (None, None) if neither form matches.
  """
  for subject_re in (ISSUE_CHANGE_SUBJECT_RE,
                     ISSUE_CHANGE_COMPACT_SUBJECT_RE):
    found = subject_re.match(subject)
    if found:
      return found.group('project'), found.group('local_id')

  return None, None
353
354
# TODO(jrobbins): For now, we strip out lines that look like quoted
# text and then will give the user the option to see the whole email.
# For 2.0 of this feature, we should change the Comment PB to have
# runs of text with different properties so that the UI can present
# "- Show quoted text -" and expand it in-line.

# TODO(jrobbins): For now, we look for lines that indicate quoted
# text (e.g., they start with ">"). But, we should also collapse
# multiple lines that are identical to other lines in previous
# non-deleted comments on the same issue, regardless of quote markers.
365
366
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
#
# A line counts as a signature boundary if it consists only of:
#   - runs of '-', '_', or '=' separated by single spaces, or
#   - a common sign-off word or phrase, or
#   - a "Sent from my iPhone/Phone/iPod" footer,
# optionally followed by a comma and trailing spaces.
#
# The separator alternative is written as "run( run)* ?" rather than
# "(run ?)+".  Both accept exactly the same lines, but the nested-quantifier
# form backtracks exponentially on a long run of dashes that is followed by
# any other character, and these lines come from untrusted inbound email.
SIGNATURE_BOUNDARY_RE = re.compile(
    r'^([-_=]+( [-_=]+)* ?|'
    r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
    r'Sent from my i?Phone|Sent from my iPod)'
    r',? *$', re.I)

# Only this many lines at the end of a message are searched for a signature.
MAX_SIGNATURE_LINES = 8
376
# Line patterns that mark the start of forwarded text, of a quoted Monorail
# notification, or of an explicit signature.
FORWARD_OR_EXPLICIT_SIG_PATS = [
    r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
    r'Updates:\s*$',
    r'Comment #\d+ on issue \d+ by \S+:',
    # If we see this anywhere in the message, treat the rest as a signature.
    r'--\s*$',
    ]
# Matches from the start of the first line that fits one of the patterns
# above through the end of the text.  "[\s\S]*" consumes the rest of the
# text, newlines included, in one step; it matches the same text as
# "(.|\n)*" without running a capturing alternation once per character.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
    r'^(%s)[\s\S]*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
387
# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    # The dots of ".appspotmail.com" are escaped so that they match only a
    # literal dot.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s.*wrote:\s*$',
    # French "a écrit :".  In decoded text the accented letter is the single
    # character \xe9; \xc3\xa9 is what its UTF-8 encoding looks like when
    # the text is still bytes (or was mis-decoded), so accept both.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s+.*a\s+(?:\xc3\xa9|\xe9)crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    r'^>.*$',
    ]
# One or more consecutive quote lines, together with any blank lines directly
# before and after them.  A blank line is written as "^[^\S\n]*\n", i.e.,
# whitespace other than newline followed by one newline, so each repetition
# consumes exactly one line.  With "^\s*\n" a run of blank lines could be
# divided among repetitions in exponentially many ways, all of which were
# tried whenever the blank lines were not followed by a quote line.
QUOTED_BLOCKS_RE = re.compile(
    r'(^[^\S\n]*\n)*((%s)\n?)+(^[^\S\n]*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
402
403
def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of an inbound email body.

  Returns:
    The text with forwarded or explicitly signed trailing content removed,
    each block of quoted lines replaced by a single newline, any signature
    found within the last MAX_SIGNATURE_LINES lines cut off, and leading
    and trailing whitespace stripped.
  """
  # If the rest of the message is forwarded text, we're done with it.
  without_forward = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub(
      '', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  without_quotes = QUOTED_BLOCKS_RE.sub('\n', without_forward)

  kept_lines = without_quotes.strip().split('\n')
  # Make another pass over the last few lines to strip out signatures.
  sig_zone_start = max(0, len(kept_lines) - MAX_SIGNATURE_LINES)
  for offset, line in enumerate(kept_lines[sig_zone_start:]):
    if SIGNATURE_BOUNDARY_RE.match(line):
      # We found the likely start of a signature, just keep the lines
      # above it.
      kept_lines = kept_lines[:sig_zone_start + offset]
      break

  return '\n'.join(kept_lines).strip()
422 return '\n'.join(new_lines).strip()