Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style |
| 3 | # license that can be found in the LICENSE file or at |
| 4 | # https://developers.google.com/open-source/licenses/bsd |
| 5 | |
| 6 | """Functions that format or parse email messages in Monorail. |
| 7 | |
| 8 | Specifically, this module has the logic for generating various email |
| 9 | header lines that help match inbound and outbound email to the project |
| 10 | and artifact that generated it. |
| 11 | """ |
| 12 | from __future__ import print_function |
| 13 | from __future__ import division |
| 14 | from __future__ import absolute_import |
| 15 | |
import email.utils
import hashlib
import hmac
import logging
import re
import rfc822
| 20 | |
| 21 | import six |
| 22 | |
| 23 | from google.appengine.api import app_identity |
| 24 | |
| 25 | import settings |
| 26 | from framework import framework_constants |
| 27 | from services import client_config_svc |
| 28 | from services import secrets_svc |
| 29 | |
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# Upper bound on len(body) accepted by IsBodyTooBigToParse(): 100 * 1024 is
# 102400 (bytes or characters, depending on the type of body passed in).
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() truncates its whitespace-normalized result to this many
# characters; MakeMessageID() hashes that truncated value.
MAX_HEADER_CHARS_CONSIDERED = 255
| 35 | |
| 36 | |
| 37 | def _checkEmailHeaderPrefix(key): |
| 38 | """Ensures that a given email header starts with X-Alert2Monorail prefix.""" |
| 39 | # this is to catch typos in the email header prefix and raises an exception |
| 40 | # during package loading time. |
| 41 | assert key.startswith('X-Alert2Monorail') |
| 42 | return key |
| 43 | |
| 44 | |
class AlertEmailHeader(object):
  """A list of the email header keys supported by Alert2Monorail."""
  # pylint: disable=bad-whitespace
  #
  # The prefix has been hard-coded without string substitution to make them
  # searchable with the header keys.
  #
  # Every key except INCIDENT_ID is passed through _checkEmailHeaderPrefix()
  # while this class body executes, so a misspelled prefix fails when the
  # class is defined.  INCIDENT_ID does not use the X-Alert2Monorail prefix
  # and is therefore assigned directly.
  INCIDENT_ID = 'X-Incident-Id'
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
| 60 | |
| 61 | |
def IsBodyTooBigToParse(body):
  """Tell whether an email body exceeds the size we are willing to parse.

  Args:
    body: the email message body; anything that supports len().

  Returns:
    True if len(body) is strictly greater than MAX_BODY_SIZE, else False.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
| 65 | |
| 66 | |
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Tell whether the project's address is one of the To: recipients.

  Args:
    project_addr: string email address of the project.
    to_addrs: collection of the addresses found on the To: line.

  Returns:
    True if project_addr is contained in to_addrs, i.e., the email was
    explicitly sent directly to us.
  """
  sent_directly_to_us = project_addr in to_addrs
  return sent_directly_to_us
| 70 | |
| 71 | |
def _DecodeTextPayload(part):
  """Return the transfer-decoded payload of a MIME part as a unicode string.

  Args:
    part: email.message.Message for a single text part.

  Returns:
    The payload as unicode text.  It is decoded with the charset declared on
    the part, or UTF-8 when none is declared.  If that charset is unknown or
    the bytes are not valid in it, UTF-8 with replacement characters is used
    so that one odd message cannot abort processing.  A part with no payload
    yields an empty string.
  """
  payload = part.get_payload(decode=True)
  if payload is None:
    return u''
  if isinstance(payload, six.text_type):
    return payload
  charset = part.get_content_charset() or 'utf-8'
  try:
    return payload.decode(charset)
  except (LookupError, UnicodeDecodeError):
    return payload.decode('utf-8', 'replace')


def ParseEmailMessage(msg):
  """Parse the given email message and return relevant fields.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references,
    incident_id, subject, body.  references is a list of the distinct,
    non-empty In-Reply-To and References values in the order they were first
    seen.  subject has any reply/forward prefixes stripped.  body is the text
    of the first text/plain part, or an empty string if there is none.
    If the Precedence header is "bulk" or "junk", the tuple
    ('', [], [], '', '', '', '') is returned instead.
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ['bulk', 'junk']:
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', '', ''

  from_addrs = _ExtractAddrs(msg.get('from', ''))
  from_addr = from_addrs[0] if from_addrs else ''

  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  in_reply_to = msg.get('in-reply-to', '')
  incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')

  # De-duplicate while keeping first-seen order so that the result is
  # deterministic from run to run.
  references = []
  seen_refs = set()
  for ref in [in_reply_to] + msg.get('references', '').split():
    if ref and ref not in seen_refs:
      seen_refs.add(ref)
      references.append(ref)

  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = u''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() == 'text/plain':
      body = _DecodeTextPayload(part)
      break  # Only consider the first text part.

  return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
          body)
| 115 | |
| 116 | |
| 117 | def _ExtractAddrs(header_value): |
| 118 | """Given a message header value, return email address found there.""" |
| 119 | friendly_addr_pairs = list(rfc822.AddressList(header_value)) |
| 120 | return [addr for _friendly, addr in friendly_addr_pairs] |
| 121 | |
| 122 | |
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. from a subject line.

  Prefixes are removed one at a time, trimming surrounding whitespace after
  each removal, until the text no longer starts with a known prefix.
  """
  remaining = subject
  while True:
    found = _FindSubjectPrefix(remaining)
    if not found:
      return remaining
    remaining = remaining[len(found):].strip()
| 131 | |
| 132 | |
| 133 | def _FindSubjectPrefix(subject): |
| 134 | """If the given subject starts with a prefix, return that prefix.""" |
| 135 | for prefix in ['re:', 'aw:', 'fwd:', 'fw:']: |
| 136 | if subject.lower().startswith(prefix): |
| 137 | return prefix |
| 138 | |
| 139 | return None |
| 140 | |
| 141 | |
def MailDomain():
  """Return the domain name where this app can receive email."""
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  configured_domain = settings.mail_domain
  if configured_domain:
    return configured_domain

  # Keep only the text after the last ':' in the application id, if any.
  full_app_id = app_identity.get_application_id()
  bare_app_id = full_app_id.rpartition(':')[2]
  return '%s.appspotmail.com' % bare_app_id
| 158 | |
| 159 | |
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Build a From: line value that includes the commenter's friendly name.

  Args:
    commenter_view: UserView of the user who made the comment, or a false
        value if there is no commenter.
    sender: string email address that the message is sent from.
    reveal_addr: bool. If True, the friendly name is taken from the
        commenter's real email address rather than the obscured one.

  Returns:
    sender unchanged when no commenter_view is given.  Otherwise a string of
    the form '<friendly> via <site name> <<sender>>', where the sender
    address has '+v2.<user_id>' added to its local part if it contains '@'.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name.lower()
  if commenter_view.email in client_config_svc.GetServiceAccountMap():
    # Service accounts are shown by their display name.
    friendly = commenter_view.display_name
  elif reveal_addr:
    friendly = commenter_view.email
  else:
    friendly = u'%s\u2026@%s' % (
        commenter_view.obscured_username, commenter_view.domain)

  if '@' in sender:
    sender_username, _at, sender_domain = sender.partition('@')
    sender = '%s+v2.%d@%s' % (
        sender_username, commenter_view.user_id, sender_domain)

  # Only the part before any '@' is used as the friendly name.
  friendly = friendly.partition('@')[0]
  return '%s via %s <%s>' % (friendly, site_name, sender)
| 179 | |
| 180 | |
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Return an address that ignores all messages sent to it.

  Args:
    commenter_view: Optional UserView of the user who made a comment; passed
        through to FormatFriendly().
    reveal_addr: Optional bool passed through to FormatFriendly().

  Returns:
    The 'no_reply@<mail domain>' address as formatted by FormatFriendly().
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  # Note: This does not take branded domains into account, but this address
  # is only used for email error messages and in the reply-to address
  # when the user is not allowed to reply.
  mail_domain = MailDomain()
  no_reply_addr = 'no_reply@%s' % mail_domain
  return FormatFriendly(commenter_view, no_reply_addr, reveal_addr)
| 190 | |
| 191 | |
def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    project: Project PB for the project that the email is sent from.
    commenter_view: Optional UserView of the user who made a comment.  We use
        the user's (potentially obscured) email address as their friendly
        name.
    reveal_addr: Optional bool.  If False then the address is obscured.
    can_reply_to: Optional bool.  If True then settings.send_email_as_format
        is used, otherwise settings.send_noreply_email_as_format is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    addr_format = settings.send_email_as_format
  else:
    addr_format = settings.send_noreply_email_as_format

  # Look for a domain branded for this project, then the '*' entry, and
  # finally fall back to a fixed default.
  branded_domains = settings.branded_domains
  domain = branded_domains.get(
      project.project_name, branded_domains.get('*'))
  if not domain:
    domain = 'chromium.org'

  # Keep only the last two dot-separated labels of longer host names.
  labels = domain.split('.')
  if len(labels) > 2:
    domain = '.'.join(labels[-2:])

  addr = addr_format % {'domain': domain}
  return FormatFriendly(commenter_view, addr, reveal_addr)
| 217 | |
| 218 | |
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Reply/forward prefixes are stripped, every run of whitespace is collapsed
  to a single space, and the result is cut to MAX_HEADER_CHARS_CONSIDERED
  characters.
  """
  without_prefixes = _StripSubjectPrefixes(s)
  # str.split() with no argument splits on any run of whitespace.
  single_spaced = ' '.join(without_prefixes.split())
  return single_spaced[:MAX_HEADER_CHARS_CONSIDERED]
| 225 | |
| 226 | |
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: string address that the email is sent to.
    subject: string subject line; it is passed through NormalizeHeader()
        before being hashed.
    from_addr: string address that the email is sent from.  Only the part
        before the first '@' is used.

  Returns:
    A string of the form
    '<0=HMAC(to_addr)=HMAC(subject)=from_local_part@mail_domain>'.
  """
  normalized_subject = NormalizeHeader(subject)
  if isinstance(normalized_subject, six.text_type):
    normalized_subject = normalized_subject.encode('utf-8')
  # HMAC operates on bytes, so encode the address the same way as the subject.
  if isinstance(to_addr, six.text_type):
    to_addr = to_addr.encode('utf-8')
  mail_hmac_key = secrets_svc.GetEmailKey()

  def _HexDigest(data):
    # MD5 was the implicit default of hmac.new() when digestmod was omitted.
    # It is named explicitly because newer Python versions require digestmod,
    # and it must stay MD5 so that ValidateReferencesHeader() still
    # recognizes IDs in messages that were already sent.
    return hmac.new(mail_hmac_key, data, digestmod=hashlib.md5).hexdigest()

  return '<0=%s=%s=%s@%s>' % (
      _HexDigest(to_addr),
      _HexDigest(normalized_subject),
      from_addr.split('@')[0],
      MailDomain())
| 238 | |
| 239 | |
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header value so that this message threads properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
        or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  An empty string is returned
    when seq_num is None.
  """
  if seq_num is None:
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
| 259 | |
| 260 | |
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.
  """
  sender = '%s@%s' % (project.project_name, MailDomain())
  expected_ref = MakeMessageID(from_addr, subject, sender)

  # hmac.compare_digest() needs both arguments to be of the same type, so
  # normalize text to UTF-8 bytes first.
  if isinstance(expected_ref, six.text_type):
    expected_ref = expected_ref.encode('utf-8')
  if isinstance(message_ref, six.text_type):
    message_ref = message_ref.encode('utf-8')
  if not isinstance(message_ref, six.binary_type):
    # Something that is not a string can never equal the expected value.
    return False

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.
  # message_ref comes from an inbound email and expected_ref is derived from
  # a secret key, so compare in a way that does not leak through timing how
  # much of the value matched.
  return hmac.compare_digest(expected_ref, message_ref)
| 280 | |
| 281 | |
# Matches an inbound project address of the form
# "project[+verb[+label]]@domain", with named groups for each piece.
# IdentifyProjectVerbAndLabel() applies it with .match() to the lowercased
# address, so it is anchored at the start of the string only.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches a subject line of the form "Issue <local_id> in <project>: <summary>".
ISSUE_CHANGE_SUBJECT_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')

# Matches the compact subject line form "<project>:<local_id>: <summary>".
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
    r'(?P<project>[-a-z0-9]+):'
    r'(?P<local_id>[0-9]+): '
    r'(?P<summary>.+)')
| 296 | |
| 297 | |
def IdentifyIssue(project_name, subject):
  """Parse the artifact id from a reply and verify it is a valid issue.

  Args:
    project_name: string the project to search for the issue in.
    subject: string email subject line received, it must match the one
        sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    An int local_id for the id of the issue.  None if no id is found, the id
    is not valid, or the project named in the subject is not project_name.
  """
  issue_project_name, local_id_str = _MatchSubject(subject)

  if project_name != issue_project_name:
    # Something is wrong with the project name.
    return None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  # local_id_str is None when the subject did not match, which int() rejects
  # with TypeError.
  try:
    return int(local_id_str)
  except (ValueError, TypeError):
    return None
| 326 | |
| 327 | |
def IdentifyProjectVerbAndLabel(project_addr):
  """Split an inbound project email address into its parts.

  Args:
    project_addr: string email address that the message was sent to,
        expected to look like "project[+verb[+label]]@domain".

  Returns:
    A tuple (project_name, verb, label).  verb and label are None when the
    address does not include them.  All three are None when the address
    starts with "no_reply@" or does not match PROJECT_EMAIL_RE.
  """
  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return None, None, None

  m = PROJECT_EMAIL_RE.match(project_addr.lower())
  if not m:
    return None, None, None

  return m.group('project', 'verb', 'label')
| 343 | |
| 344 | |
def _MatchSubject(subject):
  """Parse the project name and issue local id from a subject line.

  The long subject format is tried first, then the compact one.

  Returns:
    A pair (project, local_id) of strings taken from the first format that
    matches, or (None, None) if neither matches.
  """
  for subject_re in (ISSUE_CHANGE_SUBJECT_RE, ISSUE_CHANGE_COMPACT_SUBJECT_RE):
    m = subject_re.match(subject)
    if m:
      return m.group('project'), m.group('local_id')

  return None, None
| 353 | |
| 354 | |
| 355 | # TODO(jrobbins): For now, we strip out lines that look like quoted |
| 356 | # text and then will give the user the option to see the whole email. |
| 357 | # For 2.0 of this feature, we should change the Comment PB to have |
| 358 | # runs of text with different properties so that the UI can present |
| 359 | # "- Show quoted text -" and expand it in-line. |
| 360 | |
| 361 | # TODO(jrobbins): For now, we look for lines that indicate quoted |
| 362 | # text (e.g., they start with ">"). But, we should also collapse |
| 363 | # multiple lines that are identical to other lines in previous |
| 364 | # non-deleted comments on the same issue, regardless of quote markers. |
| 365 | |
| 366 | |
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
# The separator-line alternative is written as runs of [-_=] joined by single
# spaces.  A nested quantifier such as ([-_=]+ ?)+ accepts the same lines but
# backtracks exponentially on a long run of dashes followed by other text.
SIGNATURE_BOUNDARY_RE = re.compile(
    r'^([-_=]+(?: [-_=]+)* ?|'
    r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
    r'Sent from my i?Phone|Sent from my iPod)'
    r',? *$', re.I)

# Only this many lines at the end of a message are checked for a signature.
MAX_SIGNATURE_LINES = 8

FORWARD_OR_EXPLICIT_SIG_PATS = [
    r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
    r'Updates:\s*$',
    r'Comment #\d+ on issue \d+ by \S+:',
    # If we see this anywhere in the message, treat the rest as a signature.
    r'--\s*$',
    ]
# Matches from the start of the first line that matches one of the patterns
# above through the end of the text.  [\s\S] matches any character,
# including newlines.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
    r'^(%s)[\s\S]*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
    flags=re.MULTILINE | re.IGNORECASE)

# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s.*wrote:\s*$',
    # French "a ecrit" with an e-acute: accept both the decoded character
    # (\xe9) and its UTF-8 byte sequence (\xc3\xa9), so that unicode and
    # undecoded text are both recognized.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s+.*a\s+(?:\xe9|\xc3\xa9)crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    r'^>.*$',
    ]
# One or more consecutive quoted lines, plus any blank lines around them.
QUOTED_BLOCKS_RE = re.compile(
    r'(^\s*\n)*((%s)\n?)+(^\s*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)


def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of an inbound email body.

  Returns:
    The text with forwarded or explicitly marked trailing content, quoted
    blocks, and a trailing signature removed, and with leading and trailing
    whitespace stripped.
  """
  # If the rest of message is forwarded text, we're done.
  description = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub('', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  description = QUOTED_BLOCKS_RE.sub('\n', description)

  new_lines = description.strip().split('\n')
  # Make another pass over the last few lines to strip out signatures.
  sig_zone_start = max(0, len(new_lines) - MAX_SIGNATURE_LINES)
  for idx in range(sig_zone_start, len(new_lines)):
    if SIGNATURE_BOUNDARY_RE.match(new_lines[idx]):
      # We found the likely start of a signature, just keep the lines above it.
      new_lines = new_lines[:idx]
      break

  return '\n'.join(new_lines).strip()