blob: e14075c7c21fda271d3d46d32d316b2be6a80b5d [file] [log] [blame]
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Functions that format or parse email messages in Monorail.
Specifically, this module has the logic for generating various email
header lines that help match inbound and outbound email to the project
and artifact that generated it.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import hashlib
import hmac
import logging
import re
import six
if six.PY2:
import rfc822
else:
import email.utils
from google.appengine.api import app_identity
import settings
from framework import framework_constants
from services import client_config_svc
from services import secrets_svc
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# Upper bound on len(body) used by IsBodyTooBigToParse(): a body longer than
# this is reported as too big to process.
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() truncates the normalized header text to this many
# characters; MakeMessageID() hashes that truncated text.
MAX_HEADER_CHARS_CONSIDERED = 255
def _checkEmailHeaderPrefix(key):
  """Ensure that an email header key starts with the X-Alert2Monorail prefix.

  This is called while the AlertEmailHeader class body is evaluated, so a
  typo in a header key surfaces as soon as the module is loaded.

  Args:
    key: string header name to validate.

  Returns:
    The same key, unchanged.

  Raises:
    AssertionError: if key does not start with 'X-Alert2Monorail'.
  """
  # Raise explicitly instead of using an assert statement: assert statements
  # are removed when Python runs with -O, which would silently disable this
  # load-time check.  The exception type is kept for backward compatibility.
  if not key.startswith('X-Alert2Monorail'):
    raise AssertionError(
        'Email header key %r does not start with X-Alert2Monorail' % key)
  return key


class AlertEmailHeader(object):
  """A list of the email header keys supported by Alert2Monorail."""
  # pylint: disable=bad-whitespace
  #
  # The prefix has been hard-coded without string substitution to make them
  # searchable with the header keys.

  # INCIDENT_ID does not carry the X-Alert2Monorail prefix, so it is the one
  # key that is not passed through _checkEmailHeaderPrefix().
  INCIDENT_ID = 'X-Incident-Id'
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
def IsBodyTooBigToParse(body):
  """Report whether an email body exceeds the size we are willing to parse.

  Args:
    body: the message body; only its len() is examined.

  Returns:
    True if len(body) is greater than MAX_BODY_SIZE, otherwise False.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Tell whether the project's address is among the To: addresses.

  Args:
    project_addr: the project's inbound email address.
    to_addrs: collection of addresses taken from the To: line.

  Returns:
    True if project_addr is contained in to_addrs, i.e., the email was
    explicitly sent directly to us.  Otherwise, False.
  """
  addressed_directly = project_addr in to_addrs
  return addressed_directly
def _DecodePlainTextPayload(payload, charset):
  """Decode the raw bytes of a text/plain part into a unicode string.

  Args:
    payload: byte string holding the transfer-decoded part content.
    charset: charset name declared by the part, or None if none was declared.

  Returns:
    The decoded unicode text.  Bytes that cannot be decoded by any attempt
    are replaced with U+FFFD rather than raising.
  """
  # UTF-8 is tried first so that every payload that decoded successfully
  # before still produces exactly the same text.
  try:
    return payload.decode('utf-8')
  except UnicodeDecodeError:
    pass
  # Not valid UTF-8: honor the charset that the sender declared, if any.
  if charset:
    try:
      return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
      # Unknown charset name, or the content does not match the declaration.
      pass
  return payload.decode('utf-8', 'replace')


def ParseEmailMessage(msg):
  """Parse the given MessageRouterMessage and return relevant fields.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references,
    incident_id, subject, body.  The references are a de-duplicated list
    built from the In-Reply-To and References headers, in no particular
    order.  The subject has reply/forward prefixes stripped.  The body is the
    unicode text of the first text/plain part, or u'' if there is none.
    If the Precedence header marks the message as bulk or junk, the tuple
    ('', [], [], '', '', '', '') is returned instead.
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ['bulk', 'junk']:
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', '', ''

  from_addrs = _ExtractAddrs(msg.get('from', ''))
  from_addr = from_addrs[0] if from_addrs else ''
  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  in_reply_to = msg.get('in-reply-to', '')
  incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')
  references = msg.get('references', '').split()
  # Merge In-Reply-To into the references, dropping blanks and duplicates.
  references = list({ref for ref in [in_reply_to] + references if ref})
  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = u''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() != 'text/plain':
      continue
    payload = part.get_payload(decode=True)
    if isinstance(payload, six.text_type):
      body = payload
    elif payload is not None:
      # Previously this was a strict UTF-8 decode, which raised
      # UnicodeDecodeError for parts sent in any other charset.
      body = _DecodePlainTextPayload(payload, part.get_content_charset())
    break  # Only consider the first text part.

  return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
          body)
def _ExtractAddrs(header_value):
  """Given a message header value, return the email addresses found there.

  Args:
    header_value: string value of an address header such as From:, To:,
      or Cc:.  May be the empty string when the header is absent.

  Returns:
    A list of the non-empty address strings parsed from the header, without
    their friendly names.  An empty list if no address was found.
  """
  if six.PY2:
    friendly_addr_pairs = list(rfc822.AddressList(header_value))
  else:
    friendly_addr_pairs = email.utils.getaddresses([header_value])
  # The parser can report an empty or unparseable header as a ('', '') pair
  # (documented for email.utils.parseaddr(), and getaddresses() does the same
  # in its strict mode).  Skip empty addresses so that callers never see ''
  # in the list.
  return [addr for _friendly, addr in friendly_addr_pairs if addr]
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. marker from a subject line.

  Args:
    subject: string subject line.

  Returns:
    The subject with all leading reply/forward prefixes removed.  After each
    prefix is removed, the remaining text is stripped of surrounding
    whitespace.
  """
  while True:
    leading = _FindSubjectPrefix(subject)
    if not leading:
      return subject
    # Only the length of the prefix matters; the helper returns it lowercased.
    subject = subject[len(leading):].strip()
def _FindSubjectPrefix(subject):
  """Return the reply/forward prefix that the subject starts with, if any.

  Args:
    subject: string subject line.

  Returns:
    The lowercase prefix ('re:', 'aw:', 'fwd:', or 'fw:') that the subject
    starts with, compared case-insensitively, or None if it starts with
    none of them.
  """
  lowered = subject.lower()
  known_prefixes = ('re:', 'aw:', 'fwd:', 'fw:')
  return next(
      (candidate for candidate in known_prefixes
       if lowered.startswith(candidate)),
      None)
def MailDomain():
  """Return the domain name where this app can receive email.

  Returns:
    A fixed test domain in unit test mode, else settings.mail_domain if it
    is set, else '<app id>.appspotmail.com' where the app id is the part of
    the App Engine application id after its last ':'.
  """
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  configured_domain = settings.mail_domain
  if configured_domain:
    return configured_domain

  # Keep only the text after the last ':' (the whole id when there is none).
  bare_app_id = app_identity.get_application_id().rpartition(':')[2]
  return '%s.appspotmail.com' % bare_app_id
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Format the From: line to include the commenter's friendly name if given.

  Args:
    commenter_view: UserView of the commenter, or a false value for none.
    sender: string email address that the message is sent from.
    reveal_addr: bool.  If True the commenter's email is used as the friendly
      name, otherwise the obscured form is used.

  Returns:
    The sender unchanged when there is no commenter_view.  Otherwise a
    string of the form 'FRIENDLY via SITE <SENDER>', where the sender
    address gets a '+v2.USER_ID' suffix on its username part.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name.lower()
  if commenter_view.email in client_config_svc.GetServiceAccountMap():
    # Service accounts are shown by their display name.
    friendly = commenter_view.display_name
  elif reveal_addr:
    friendly = commenter_view.email
  else:
    friendly = u'%s\u2026@%s' % (
        commenter_view.obscured_username, commenter_view.domain)

  if '@' in sender:
    sender_username, _at, sender_domain = sender.partition('@')
    sender = '%s+v2.%d@%s' % (
        sender_username, commenter_view.user_id, sender_domain)

  # Only the part before the first '@' is used as the friendly name.
  friendly = friendly.split('@')[0]
  return '%s via %s <%s>' % (friendly, site_name, sender)
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Return an address that ignores all messages sent to it.

  Args:
    commenter_view: Optional UserView of the commenter, passed through to
      FormatFriendly().
    reveal_addr: Optional bool, passed through to FormatFriendly().

  Returns:
    The 'no_reply@<mail domain>' address as formatted by FormatFriendly().
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  # Note: This does not take branded domains into account, but this address
  # is only used for email error messages and in the reply-to address
  # when the user is not allowed to reply.
  return FormatFriendly(
      commenter_view, 'no_reply@%s' % MailDomain(), reveal_addr)
def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    project: Project PB for the project that the email is sent from.
    commenter_view: Optional UserView of the user who made a comment.  We use
      the user's (potentially obscured) email address as their friendly name.
    reveal_addr: Optional bool.  If False then the address is obscured.
    can_reply_to: Optional bool.  If True then settings.send_email_as_format
      is used, otherwise settings.send_noreply_email_as_format is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    addr_format = settings.send_email_as_format
  else:
    addr_format = settings.send_noreply_email_as_format

  # Look up the project's branded domain, then the '*' entry, and finally
  # fall back to a hard-coded default.
  branded = settings.branded_domains.get(
      project.project_name, settings.branded_domains.get('*'))
  domain = branded or 'chromium.org'

  # Keep only the last two dot-separated labels of longer domain names.
  labels = domain.split('.')
  if len(labels) > 2:
    domain = '.'.join(labels[-2:])

  from_addr = addr_format % {'domain': domain}
  return FormatFriendly(commenter_view, from_addr, reveal_addr)
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Args:
    s: string header value, e.g., a subject line.

  Returns:
    The value with reply/forward prefixes stripped, every run of whitespace
    collapsed to a single space, and the result cut off after
    MAX_HEADER_CHARS_CONSIDERED characters.
  """
  # split() with no arguments splits on any runs of whitespace.
  collapsed = ' '.join(_StripSubjectPrefixes(s).split())
  return collapsed[:MAX_HEADER_CHARS_CONSIDERED]
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: string address that the message is sent to.
    subject: string subject line; it is normalized before being hashed.
    from_addr: string address that the message is sent from; only the part
      before the first '@' is used.

  Returns:
    A string of the form '<0=TO_HASH=SUBJECT_HASH=FROM_USER@MAIL_DOMAIN>',
    where each hash is a hex HMAC-MD5 digest keyed with the email key from
    secrets_svc.
  """
  subject_bytes = NormalizeHeader(subject)
  if isinstance(subject_bytes, six.text_type):
    subject_bytes = subject_bytes.encode('utf-8')
  mail_hmac_key = secrets_svc.GetEmailKey()

  def _KeyedHexDigest(value):
    """Return the hex HMAC-MD5 digest of value under the email key."""
    return hmac.new(
        mail_hmac_key, six.ensure_binary(value),
        digestmod=hashlib.md5).hexdigest()

  to_addr_hash = _KeyedHexDigest(to_addr)
  subject_hash = _KeyedHexDigest(subject_bytes)
  from_user = from_addr.split('@')[0]
  return '<0=%s=%s=%s@%s>' % (
      to_addr_hash, subject_hash, from_user, MailDomain())
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header to make this message thread properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
      or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  The empty string is
    returned when seq_num is None.
  """
  if seq_num is None:
    # Not part of a thread, so there is nothing to reference.
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.
  """
  # Rebuild the Message-ID that we would have generated for a message sent
  # from the project's address to the replier, then compare.
  project_sender = '%s@%s' % (project.project_name, MailDomain())
  could_have_sent = MakeMessageID(from_addr, subject, project_sender)

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.
  return could_have_sent == message_ref
# Matches an inbound project address of the form
# "project[+verb[+label]]@domain".  Only lower-case characters are listed;
# IdentifyProjectVerbAndLabel() lower-cases the address before matching.
PROJECT_EMAIL_RE = re.compile(
r'(?P<project>[-a-z0-9]+)'
r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
r'@(?P<domain>[-a-z0-9.]+)')
# Matches a subject line of the form "Issue 123 in project-name: summary".
ISSUE_CHANGE_SUBJECT_RE = re.compile(
r'Issue (?P<local_id>[0-9]+) in '
r'(?P<project>[-a-z0-9]+): '
r'(?P<summary>.+)')
# Matches the compact subject form "project-name:123: summary".  It is tried
# by _MatchSubject() only when ISSUE_CHANGE_SUBJECT_RE does not match.
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
r'(?P<project>[-a-z0-9]+):'
r'(?P<local_id>[0-9]+): '
r'(?P<summary>.+)')
def IdentifyIssue(project_name, subject):
  """Parse the artifact id from a reply and verify it is a valid issue.

  Args:
    project_name: string the project to search for the issue in.
    subject: string email subject line received, it must match the one
      sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    An int local_id for the id of the issue.  None if no id is found, the id
    is not valid, or the subject names a different project.
  """
  subject_project_name, local_id_str = _MatchSubject(subject)
  if subject_project_name != project_name:
    # Something is wrong with the project name.
    return None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  try:
    return int(local_id_str)
  except (ValueError, TypeError):
    # local_id_str was None or was not a base-10 integer.
    return None
def IdentifyProjectVerbAndLabel(project_addr):
  """Split an inbound project email address into its parts.

  Args:
    project_addr: string email address that the message was sent to, of the
      form project[+verb[+label]]@domain.

  Returns:
    A tuple (project_name, verb, label).  Parts that are absent from the
    address are None.  All three are None if the address starts with
    'no_reply@' or does not match PROJECT_EMAIL_RE.
  """
  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return None, None, None

  addr_match = PROJECT_EMAIL_RE.match(project_addr.lower())
  if not addr_match:
    return None, None, None

  return (addr_match.group('project'), addr_match.group('verb'),
          addr_match.group('label'))
def _MatchSubject(subject):
  """Parse the project name and artifact id from a subject line.

  Args:
    subject: string subject line with reply/forward prefixes removed.

  Returns:
    A tuple (project_name, local_id_str) of strings taken from the first
    subject format that matches, or (None, None) if neither format matches.
  """
  # The long format is tried first, then the compact one.
  subject_formats = (ISSUE_CHANGE_SUBJECT_RE, ISSUE_CHANGE_COMPACT_SUBJECT_RE)
  for subject_re in subject_formats:
    found = subject_re.match(subject)
    if found:
      return found.group('project'), found.group('local_id')
  return None, None
# TODO(jrobbins): For now, we strip out lines that look like quoted
# text and then will give the user the option to see the whole email.
# For 2.0 of this feature, we should change the Comment PB to have
# runs of text with different properties so that the UI can present
# "- Show quoted text -" and expand it in-line.
# TODO(jrobbins): For now, we look for lines that indicate quoted
# text (e.g., they start with ">"). But, we should also collapse
# multiple lines that are identical to other lines in previous
# non-deleted comments on the same issue, regardless of quote markers.
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
# A line matches if, ignoring case, it consists only of runs of '-', '_' or
# '=' characters, or of one of the listed sign-off phrases, optionally
# followed by a comma and trailing spaces.
SIGNATURE_BOUNDARY_RE = re.compile(
r'^(([-_=]+ ?)+|'
r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
r'Sent from my i?Phone|Sent from my iPod)'
r',? *$', re.I)
# StripQuotedText() checks only this many lines at the end of the message
# against SIGNATURE_BOUNDARY_RE.
MAX_SIGNATURE_LINES = 8
# Line patterns that mark the start of forwarded text, of text from one of
# our own notifications, or of an explicit signature.
FORWARD_OR_EXPLICIT_SIG_PATS = [
r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
r'Updates:\s*$',
r'Comment #\d+ on issue \d+ by \S+:',
# If we see this anywhere in the message, treat the rest as a signature.
r'--\s*$',
]
# Matches from the start of the first line that begins with any of the
# patterns above through the end of the text: the trailing (.|\n)* consumes
# everything that follows, including newlines.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
r'^(%s)(.|\n)*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
flags=re.MULTILINE | re.IGNORECASE)
# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
# Each pattern matches one line that either introduces quoted text (an
# attribution line such as "On ... wrote:") or is itself a quoted line.
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    r'\S+?@[-a-z0-9]+.appspotmail.com\s.*wrote:\s*$',
    # French attribution line ("... a ecrit :" with an accented e).  In
    # unicode text the accented letter is the single character U+00E9, which
    # the original \xc3\xa9 spelling (the letter's UTF-8 bytes) can never
    # match.  Both spellings are accepted so that byte strings which matched
    # before still match.
    r'\S+?@[-a-z0-9]+.appspotmail.com\s+.*a\s+(?:\xc3\xa9|\xe9)crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    # Any line that starts with a '>' quote marker.
    r'^>.*$',
    ]
# Matches a run of one or more consecutive quote lines together with the
# blank lines immediately before and after the run.
QUOTED_BLOCKS_RE = re.compile(
    r'(^\s*\n)*((%s)\n?)+(^\s*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of the inbound email body.

  Returns:
    The text with forwarded or explicitly signed trailing content removed,
    each block of quoted lines replaced by a single newline, and a likely
    signature near the bottom cut off.  The result is stripped of
    surrounding whitespace.
  """
  # If the rest of message is forwarded text, we're done.
  without_forward = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub(
      '', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  without_quotes = QUOTED_BLOCKS_RE.sub('\n', without_forward)
  kept_lines = without_quotes.strip().split('\n')

  # Make another pass over the last few lines to strip out signatures.
  sig_zone_start = max(0, len(kept_lines) - MAX_SIGNATURE_LINES)
  sig_idx = next(
      (idx for idx in range(sig_zone_start, len(kept_lines))
       if SIGNATURE_BOUNDARY_RE.match(kept_lines[idx])),
      None)
  if sig_idx is not None:
    # We found the likely start of a signature, just keep the lines above it.
    kept_lines = kept_lines[:sig_idx]

  return '\n'.join(kept_lines).strip()