| # Copyright 2016 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Functions that format or parse email messages in Monorail. |
| |
| Specifically, this module has the logic for generating various email |
| header lines that help match inbound and outbound email to the project |
| and artifact that generated it. |
| """ |
| from __future__ import print_function |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| import hashlib |
| import hmac |
| import logging |
| import re |
| |
| import six |
| |
| if six.PY2: |
| import rfc822 |
| else: |
| import email.utils |
| |
| from google.appengine.api import app_identity |
| |
| import settings |
| from framework import framework_constants |
| from services import client_config_svc |
| from services import secrets_svc |
| |
| # TODO(jrobbins): Parsing very large messages is slow, and we are not going |
| # to handle attachments at first, so there is no reason to consider large |
| # emails. |
# Largest email body length, as measured by len(), that
# IsBodyTooBigToParse() still treats as small enough to process.
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() truncates normalized header text to this many characters.
MAX_HEADER_CHARS_CONSIDERED = 255
| |
| |
| def _checkEmailHeaderPrefix(key): |
| """Ensures that a given email header starts with X-Alert2Monorail prefix.""" |
| # this is to catch typos in the email header prefix and raises an exception |
| # during package loading time. |
| assert key.startswith('X-Alert2Monorail') |
| return key |
| |
| |
class AlertEmailHeader(object):
  """A list of the email header keys supported by Alert2Monorail."""
  # pylint: disable=bad-whitespace
  #
  # The prefix has been hard-coded without string substitution to make them
  # searchable with the header keys.
  #
  # INCIDENT_ID is the only key here that is not passed through
  # _checkEmailHeaderPrefix(); its value does not carry the X-Alert2Monorail
  # prefix.
  INCIDENT_ID = 'X-Incident-Id'
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
| |
| |
def IsBodyTooBigToParse(body):
  """Tell whether an email body exceeds the size we are willing to parse.

  Args:
    body: the email message body; only its len() is examined.

  Returns:
    True if len(body) is greater than MAX_BODY_SIZE, otherwise False.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
| |
| |
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Tell whether the project address appears among the To: addresses.

  Args:
    project_addr: string email address of the project.
    to_addrs: collection of addresses taken from the To: line.

  Returns:
    True if project_addr is contained in to_addrs, otherwise False.
  """
  sent_directly_to_us = project_addr in to_addrs
  return sent_directly_to_us
| |
| |
def _DecodeTextPayload(part):
  """Return the payload of one text message part as a unicode string.

  Args:
    part: email.message.Message for a single (non-multipart) text part.

  Returns:
    The decoded payload.  An absent payload yields an empty string.  Bytes
    are decoded with the charset declared on the part, defaulting to UTF-8;
    if that charset is unknown or does not fit the data, UTF-8 with
    replacement characters is used so that parsing never fails on encoding.
  """
  payload = part.get_payload(decode=True)
  if payload is None:
    return u''
  if isinstance(payload, six.text_type):
    return payload

  charset = part.get_content_charset() or 'utf-8'
  try:
    return payload.decode(charset)
  except (LookupError, UnicodeDecodeError):
    # LookupError: the declared charset name is not known to Python.
    # UnicodeDecodeError: the bytes are not valid in the declared charset.
    return payload.decode('utf-8', 'replace')


def ParseEmailMessage(msg):
  """Parse the given email message and return relevant fields.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references,
    incident_id, subject, body.  The references are the distinct non-empty
    In-Reply-To and References values in first-seen order.  The subject has
    reply/forward prefixes stripped.  The body is the text of the first
    text/plain part, or an empty string if there is none.  If the
    Precedence header is "bulk" or "junk", every field is returned empty.
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ['bulk', 'junk']:
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', '', ''

  from_addrs = _ExtractAddrs(msg.get('from', ''))
  from_addr = from_addrs[0] if from_addrs else ''

  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  in_reply_to = msg.get('in-reply-to', '')
  incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')

  # De-duplicate while keeping first-seen order so that the result is
  # deterministic from one run to the next.
  references = []
  seen_refs = set()
  for ref in [in_reply_to] + msg.get('references', '').split():
    if ref and ref not in seen_refs:
      seen_refs.add(ref)
      references.append(ref)

  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = u''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() == 'text/plain':
      body = _DecodeTextPayload(part)
      break  # Only consider the first text part.

  return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
          body)
| |
| |
def _ExtractAddrs(header_value):
  """Pull the bare email addresses out of an address header value.

  Args:
    header_value: string value of a header such as From:, To:, or Cc:.

  Returns:
    A list with the address portion of each parsed entry; the friendly
    display names are discarded.
  """
  if six.PY2:
    parsed_pairs = list(rfc822.AddressList(header_value))
  else:
    parsed_pairs = email.utils.getaddresses([header_value])

  addrs = []
  for _display_name, addr in parsed_pairs:
    addrs.append(addr)
  return addrs
| |
| |
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. from a subject line.

  Args:
    subject: string subject line, possibly with stacked prefixes.

  Returns:
    The subject with all leading prefixes removed.  Surrounding whitespace
    is stripped each time a prefix is removed.
  """
  while True:
    found = _FindSubjectPrefix(subject)
    if not found:
      return subject
    # Drop the prefix, then trim so that the next prefix (if any) is at
    # the very start of the string again.
    subject = subject[len(found):].strip()
| |
| |
| def _FindSubjectPrefix(subject): |
| """If the given subject starts with a prefix, return that prefix.""" |
| for prefix in ['re:', 'aw:', 'fwd:', 'fw:']: |
| if subject.lower().startswith(prefix): |
| return prefix |
| |
| return None |
| |
| |
def MailDomain():
  """Return the domain name where this app can receive email.

  Returns:
    A fixed testbed domain in unit test mode; otherwise
    settings.mail_domain when it is set; otherwise a domain derived from
    the application ID.
  """
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  configured_domain = settings.mail_domain
  if configured_domain:
    return configured_domain

  # Keep only what follows the last ':' in the application ID; an ID with
  # no ':' is used whole.
  bare_app_id = app_identity.get_application_id().split(':')[-1]
  return '%s.appspotmail.com' % bare_app_id
| |
| |
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Build a From: line value that carries the commenter's friendly name.

  Args:
    commenter_view: view of the commenting user, or a false value when
      there is no commenter.
    sender: string email address that the message is sent from.
    reveal_addr: bool, True to use the commenter's full email address as
      the friendly name instead of the obscured form.

  Returns:
    The sender unchanged when there is no commenter; otherwise a string of
    the form 'FRIENDLY via SITE <SENDER>'.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name.lower()
  if commenter_view.email in client_config_svc.GetServiceAccountMap():
    friendly = commenter_view.display_name
  elif reveal_addr:
    friendly = commenter_view.email
  else:
    friendly = u'%s\u2026@%s' % (
        commenter_view.obscured_username, commenter_view.domain)

  sender_username, at_sign, sender_domain = sender.partition('@')
  if at_sign:
    # Tag the sender address with the commenter's user ID, and show only
    # the part of the friendly name before any '@'.
    sender = '%s+v2.%d@%s' % (
        sender_username, commenter_view.user_id, sender_domain)
    friendly = friendly.partition('@')[0]

  return '%s via %s <%s>' % (friendly, site_name, sender)
| |
| |
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Build the From/Reply-To value for an address that ignores replies.

  Args:
    commenter_view: optional view of the user who made a comment.
    reveal_addr: optional bool passed through to FormatFriendly().

  Returns:
    The result of FormatFriendly() for the no_reply address on
    MailDomain().
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  # Note: This does not take branded domains into account, but this address
  # is only used for email error messages and in the reply-to address
  # when the user is not allowed to reply.
  return FormatFriendly(
      commenter_view, 'no_reply@%s' % MailDomain(), reveal_addr)
| |
| |
def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    project: Project PB for the project that the email is sent from.
    commenter_view: Optional UserView of the user who made a comment.  We use
      the user's (potentially obscured) email address as their friendly name.
    reveal_addr: Optional bool.  If False then the address is obscured.
    can_reply_to: Optional bool.  If True then settings.send_email_as_format
      is used, otherwise settings.send_noreply_email_as_format is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    addr_format = settings.send_email_as_format
  else:
    addr_format = settings.send_noreply_email_as_format

  # Prefer the project's own branded domain, then the '*' entry, then the
  # built-in default.
  branded = settings.branded_domains
  fallback_domain = branded.get('*')
  domain = branded.get(project.project_name, fallback_domain) or 'chromium.org'

  # Keep at most the last two dot-separated labels of the domain.
  labels = domain.split('.')
  if len(labels) > 2:
    domain = '.'.join(labels[-2:])

  return FormatFriendly(
      commenter_view, addr_format % {'domain': domain}, reveal_addr)
| |
| |
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Args:
    s: string header value, e.g., a subject line.

  Returns:
    The value with reply/forward prefixes stripped, every run of whitespace
    collapsed to one space, and the result cut to
    MAX_HEADER_CHARS_CONSIDERED characters.
  """
  without_prefixes = _StripSubjectPrefixes(s)
  # str.split() with no argument splits on any run of whitespace.
  single_spaced = ' '.join(without_prefixes.split())
  return single_spaced[:MAX_HEADER_CHARS_CONSIDERED]
| |
| |
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: string address that the message is sent to.
    subject: string subject line; it is normalized before hashing.
    from_addr: string address that the message is sent from; only the part
      before the first '@' is used.

  Returns:
    A string of the form '<0=TO_HASH=SUBJECT_HASH=FROM_USER@MAIL_DOMAIN>'
    where each hash is a hex HMAC-MD5 keyed with the email secret.
  """
  normalized_subject = NormalizeHeader(subject)
  mail_hmac_key = secrets_svc.GetEmailKey()

  def _KeyedHexDigest(value):
    """Return the hex HMAC-MD5 of value, encoding text as UTF-8 first."""
    return hmac.new(
        mail_hmac_key, six.ensure_binary(value),
        digestmod=hashlib.md5).hexdigest()

  to_addr_hash = _KeyedHexDigest(to_addr)
  subject_hash = _KeyedHexDigest(normalized_subject)
  sender_local_part = from_addr.split('@')[0]
  return '<0=%s=%s=%s@%s>' % (
      to_addr_hash, subject_hash, sender_local_part, MailDomain())
| |
| |
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header to make this message thread properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
      or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  An empty string is returned
    when seq_num is None.
  """
  if seq_num is None:
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
| |
| |
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.
  """
  sender = '%s@%s' % (project.project_name, MailDomain())
  expected_ref = MakeMessageID(from_addr, subject, sender)

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.

  # The expected value is derived from a secret HMAC key and message_ref
  # comes from untrusted inbound email, so compare in constant time rather
  # than with ==.  Both sides are converted to bytes first because
  # hmac.compare_digest() requires operands of the same type.
  try:
    expected_bytes = six.ensure_binary(expected_ref)
    actual_bytes = six.ensure_binary(message_ref)
  except (TypeError, UnicodeError):
    # message_ref is not a string (e.g., None) or cannot be encoded, so it
    # cannot be equal to a Message-ID that we generated.
    return False
  return hmac.compare_digest(expected_bytes, actual_bytes)
| |
| |
# Matches a project email address: a project name, optionally followed by
# "+verb" and then optionally "+label", followed by "@domain".  Only
# lowercase letters, digits, and hyphens are accepted in the names, so
# callers match against a lowercased address.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches subject lines of the form "Issue LOCAL_ID in PROJECT: SUMMARY".
ISSUE_CHANGE_SUBJECT_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')

# Matches subject lines of the compact form "PROJECT:LOCAL_ID: SUMMARY".
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
    r'(?P<project>[-a-z0-9]+):'
    r'(?P<local_id>[0-9]+): '
    r'(?P<summary>.+)')
| |
| |
def IdentifyIssue(project_name, subject):
  """Parse the artifact id from a reply and verify it is a valid issue.

  Args:
    project_name: string the project to search for the issue in.
    subject: string email subject line received, it must match the one
      sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    An int local_id for the id of the issue.  None if the project named in
    the subject differs from project_name, if no id is found, or if the id
    is not a valid integer.
  """
  issue_project_name, local_id_str = _MatchSubject(subject)

  # Guard: the subject must name the same project that we were asked about.
  if project_name != issue_project_name:
    return None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  try:
    return int(local_id_str)
  except (ValueError, TypeError):
    # ValueError: not a number.  TypeError: no id was matched (None).
    return None
| |
| |
def IdentifyProjectVerbAndLabel(project_addr):
  """Split an inbound project address into project name, verb, and label.

  Args:
    project_addr: string email address that the inbound email was sent to.

  Returns:
    A tuple (project_name, verb, label) taken from the lowercased address
    as matched by PROJECT_EMAIL_RE.  Parts that are absent are None.  All
    three are None for a "no_reply@" address or an address that does not
    match.
  """
  nothing_found = (None, None, None)

  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return nothing_found

  addr_match = PROJECT_EMAIL_RE.match(project_addr.lower())
  if not addr_match:
    return nothing_found

  return (addr_match.group('project'), addr_match.group('verb'),
          addr_match.group('label'))
| |
| |
def _MatchSubject(subject):
  """Parse the project name and artifact id from a subject line.

  Args:
    subject: string subject line with reply/forward prefixes removed.

  Returns:
    A pair (project_name, local_id_str) of strings from the first of the
    long or compact subject formats that matches, or (None, None) if
    neither matches.
  """
  subject_formats = (ISSUE_CHANGE_SUBJECT_RE, ISSUE_CHANGE_COMPACT_SUBJECT_RE)
  for subject_re in subject_formats:
    found = subject_re.match(subject)
    if found:
      return found.group('project'), found.group('local_id')

  return None, None
| |
| |
| # TODO(jrobbins): For now, we strip out lines that look like quoted |
| # text and then will give the user the option to see the whole email. |
| # For 2.0 of this feature, we should change the Comment PB to have |
| # runs of text with different properties so that the UI can present |
| # "- Show quoted text -" and expand it in-line. |
| |
| # TODO(jrobbins): For now, we look for lines that indicate quoted |
| # text (e.g., they start with ">"). But, we should also collapse |
| # multiple lines that are identical to other lines in previous |
| # non-deleted comments on the same issue, regardless of quote markers. |
| |
| |
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
#
# A boundary line is either a separator made of runs of '-', '_', or '='
# joined by single spaces, or one of a few common sign-off phrases,
# optionally followed by a comma and trailing spaces.
#
# The separator is deliberately written as "run, then (space run)*" rather
# than as a repeated group of "run plus optional space".  Both accept the
# same lines, but the repeated-group form can split one long run in
# exponentially many ways, so a long line of dashes followed by text in an
# inbound email could make matching take practically forever.
SIGNATURE_BOUNDARY_RE = re.compile(
    r'^([-_=]+( [-_=]+)* ?|'
    r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
    r'Sent from my i?Phone|Sent from my iPod)'
    r',? *$', re.I)

# Only this many lines at the end of a message are checked for a signature.
MAX_SIGNATURE_LINES = 8
| |
# Line patterns that mark the start of forwarded text, an echoed
# notification, or an explicit signature.
FORWARD_OR_EXPLICIT_SIG_PATS = [
    r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
    r'Updates:\s*$',
    r'Comment #\d+ on issue \d+ by \S+:',
    # If we see this anywhere in the message, treat the rest as a signature.
    r'--\s*$',
]
# Matches from the first line that starts with one of the patterns above
# through the end of the text.  [\s\S]* consumes every remaining character,
# newlines included, without creating a capture group per character.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
    r'^(%s)[\s\S]*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
| |
# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
#
# Each pattern matches one line that either introduces quoted text (an
# "On ... wrote:" style attribution) or is itself quoted (starts with ">").
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    # The dots in the domain are escaped so that they match only a literal
    # '.' and not an arbitrary character.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s.*wrote:\s*$',
    # French "a ecrit" attribution with an e-acute.  \xe9 matches the
    # accented letter in decoded text; \xc3\xa9 is still accepted for text
    # whose UTF-8 bytes were read as two separate characters.
    r'\S+?@[-a-z0-9]+\.appspotmail\.com\s+.*a\s+(?:\xc3\xa9|\xe9)crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    r'^>.*$',
]
# Matches a run of consecutive quote lines together with any blank lines
# directly before and after the run.
QUOTED_BLOCKS_RE = re.compile(
    r'(^\s*\n)*((%s)\n?)+(^\s*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
| |
| |
def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of an inbound comment.

  Returns:
    The text with forwarded/explicit-signature tails removed, each quoted
    block collapsed to a single newline, and any signature found within
    the last MAX_SIGNATURE_LINES lines cut off.  The result is stripped of
    leading and trailing whitespace.
  """
  # If the rest of the message is forwarded text, drop all of it.
  without_forward = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub(
      '', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  without_quotes = QUOTED_BLOCKS_RE.sub('\n', without_forward)

  kept_lines = without_quotes.strip().split('\n')

  # Make another pass over the last few lines to strip out signatures.
  sig_zone_start = max(0, len(kept_lines) - MAX_SIGNATURE_LINES)
  for offset, candidate in enumerate(kept_lines[sig_zone_start:]):
    if SIGNATURE_BOUNDARY_RE.match(candidate):
      # We found the likely start of a signature, just keep the lines
      # above it.
      kept_lines = kept_lines[:sig_zone_start + offset]
      break

  return '\n'.join(kept_lines).strip()