Blame - framework/emailfmt.py - monorail-avm99963

blob: 2933feac6bbfe26afa4264a775081ab522e7cfb7 [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	1	# Copyright 2016 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style
				3	# license that can be found in the LICENSE file or at
				4	# https://developers.google.com/open-source/licenses/bsd
				5
				6	"""Functions that format or parse email messages in Monorail.
				7
				8	Specifically, this module has the logic for generating various email
				9	header lines that help match inbound and outbound email to the project
				10	and artifact that generated it.
				11	"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import email.utils
import hashlib
import hmac
import logging
import re
import rfc822

import six

from google.appengine.api import app_identity

import settings
from framework import framework_constants
from services import client_config_svc
from services import secrets_svc
				29
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# Upper bound on len(body); IsBodyTooBigToParse() reports anything longer
# as too big to process.
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() keeps only this many leading characters of a header
# value.
MAX_HEADER_CHARS_CONSIDERED = 255
				35
				36
				37	def _checkEmailHeaderPrefix(key):
				38	"""Ensures that a given email header starts with X-Alert2Monorail prefix."""
				39	# this is to catch typos in the email header prefix and raises an exception
				40	# during package loading time.
				41	assert key.startswith('X-Alert2Monorail')
				42	return key
				43
				44
class AlertEmailHeader(object):
  """A list of the email header keys supported by Alert2Monorail.

  Every key except INCIDENT_ID is passed through _checkEmailHeaderPrefix()
  while the class body is executed, so a key with a misspelled prefix fails
  as soon as this module is loaded.
  """
  # pylint: disable=bad-whitespace
  #
  # The prefix has been hard-coded without string substitution to make them
  # searchable with the header keys.
  # INCIDENT_ID is the only key that does not use the X-Alert2Monorail prefix.
  INCIDENT_ID = 'X-Incident-Id'
  OWNER = _checkEmailHeaderPrefix('X-Alert2Monorail-owner')
  CC = _checkEmailHeaderPrefix('X-Alert2Monorail-cc')
  PRIORITY = _checkEmailHeaderPrefix('X-Alert2Monorail-priority')
  STATUS = _checkEmailHeaderPrefix('X-Alert2Monorail-status')
  COMPONENT = _checkEmailHeaderPrefix('X-Alert2Monorail-component')
  OS = _checkEmailHeaderPrefix('X-Alert2Monorail-os')
  TYPE = _checkEmailHeaderPrefix('X-Alert2Monorail-type')
  LABEL = _checkEmailHeaderPrefix('X-Alert2Monorail-label')
				60
				61
				62	def IsBodyTooBigToParse(body):
				63	"""Return True if the email message body is too big to process."""
				64	return len(body) > MAX_BODY_SIZE
				65
				66
				67	def IsProjectAddressOnToLine(project_addr, to_addrs):
				68	"""Return True if an email was explicitly sent directly to us."""
				69	return project_addr in to_addrs
				70
				71
				72	def ParseEmailMessage(msg):
				73	"""Parse the given MessageRouterMessage and return relevant fields.
				74
				75	Args:
				76	msg: email.message.Message object for the email message sent to us.
				77
				78	Returns:
				79	A tuple: from_addr, to_addrs, cc_addrs, references,
				80	incident_id, subject, body.
				81	"""
				82	# Ignore messages that are probably not from humans, see:
				83	# http://google.com/search?q=precedence+bulk+junk
				84	precedence = msg.get('precedence', '')
				85	if precedence.lower() in ['bulk', 'junk']:
				86	logging.info('Precedence: %r indicates an autoresponder', precedence)
				87	return '', [], [], '', '', '', ''
				88
				89	from_addrs = _ExtractAddrs(msg.get('from', ''))
				90	if from_addrs:
				91	from_addr = from_addrs[0]
				92	else:
				93	from_addr = ''
				94
				95	to_addrs = _ExtractAddrs(msg.get('to', ''))
				96	cc_addrs = _ExtractAddrs(msg.get('cc', ''))
				97
				98	in_reply_to = msg.get('in-reply-to', '')
				99	incident_id = msg.get(AlertEmailHeader.INCIDENT_ID, '')
				100	references = msg.get('references', '').split()
				101	references = list({ref for ref in [in_reply_to] + references if ref})
				102	subject = _StripSubjectPrefixes(msg.get('subject', ''))
				103
				104	body = u''
				105	for part in msg.walk():
				106	# We only process plain text emails.
				107	if part.get_content_type() == 'text/plain':
				108	body = part.get_payload(decode=True)
				109	if not isinstance(body, six.text_type):
				110	body = body.decode('utf-8')
				111	break # Only consider the first text part.
				112
				113	return (from_addr, to_addrs, cc_addrs, references, incident_id, subject,
				114	body)
				115
				116
				117	def _ExtractAddrs(header_value):
				118	"""Given a message header value, return email address found there."""
				119	friendly_addr_pairs = list(rfc822.AddressList(header_value))
				120	return [addr for _friendly, addr in friendly_addr_pairs]
				121
				122
				123	def _StripSubjectPrefixes(subject):
				124	"""Strip off any 'Re:', 'Fwd:', etc. subject line prefixes."""
				125	prefix = _FindSubjectPrefix(subject)
				126	while prefix:
				127	subject = subject[len(prefix):].strip()
				128	prefix = _FindSubjectPrefix(subject)
				129
				130	return subject
				131
				132
				133	def _FindSubjectPrefix(subject):
				134	"""If the given subject starts with a prefix, return that prefix."""
				135	for prefix in ['re:', 'aw:', 'fwd:', 'fw:']:
				136	if subject.lower().startswith(prefix):
				137	return prefix
				138
				139	return None
				140
				141
				142	def MailDomain():
				143	"""Return the domain name where this app can recieve email."""
				144	if settings.unit_test_mode:
				145	return 'testbed-test.appspotmail.com'
				146
				147	# If running on a GAFYD domain, you must define an app alias on the
				148	# Application Settings admin web page. If you cannot reserve the matching
				149	# APP_ID for the alias, then specify it in settings.mail_domain.
				150	if settings.mail_domain:
				151	return settings.mail_domain
				152
				153	app_id = app_identity.get_application_id()
				154	if ':' in app_id:
				155	app_id = app_id.split(':')[-1]
				156
				157	return '%s.appspotmail.com' % app_id
				158
				159
				160	def FormatFriendly(commenter_view, sender, reveal_addr):
				161	"""Format the From: line to include the commenter's friendly name if given."""
				162	if commenter_view:
				163	site_name = settings.site_name.lower()
				164	if commenter_view.email in client_config_svc.GetServiceAccountMap():
				165	friendly = commenter_view.display_name
				166	elif reveal_addr:
				167	friendly = commenter_view.email
				168	else:
				169	friendly = u'%s\u2026@%s' % (
				170	commenter_view.obscured_username, commenter_view.domain)
				171	if '@' in sender:
				172	sender_username, sender_domain = sender.split('@', 1)
				173	sender = '%s+v2.%d@%s' % (
				174	sender_username, commenter_view.user_id, sender_domain)
				175	friendly = friendly.split('@')[0]
				176	return '%s via %s <%s>' % (friendly, site_name, sender)
				177	else:
				178	return sender
				179
				180
				181	def NoReplyAddress(commenter_view=None, reveal_addr=False):
				182	"""Return an address that ignores all messages sent to it."""
				183	# Note: We use "no_reply" with an underscore to avoid potential conflict
				184	# with any project name. Project names cannot have underscores.
				185	# Note: This does not take branded domains into account, but this address
				186	# is only used for email error messages and in the reply-to address
				187	# when the user is not allowed to reply.
				188	sender = 'no_reply@%s' % MailDomain()
				189	return FormatFriendly(commenter_view, sender, reveal_addr)
				190
				191
				192	def FormatFromAddr(project, commenter_view=None, reveal_addr=False,
				193	can_reply_to=True):
				194	"""Return a string to be used on the email From: line.
				195
				196	Args:
				197	project: Project PB for the project that the email is sent from.
				198	commenter_view: Optional UserView of the user who made a comment. We use
				199	the user's (potentially obscured) email address as their friendly name.
				200	reveal_addr: Optional bool. If False then the address is obscured.
				201	can_reply_to: Optional bool. If True then settings.send_email_as is used,
				202	otherwise settings.send_noreply_email_as is used.
				203
				204	Returns:
				205	A string that should be used in the From: line of outbound email
				206	notifications for the given project.
				207	"""
				208	addr_format = (settings.send_email_as_format if can_reply_to
				209	else settings.send_noreply_email_as_format)
				210	domain = settings.branded_domains.get(
				211	project.project_name, settings.branded_domains.get('*'))
				212	domain = domain or 'chromium.org'
				213	if domain.count('.') > 1:
				214	domain = '.'.join(domain.split('.')[-2:])
				215	addr = addr_format % {'domain': domain}
				216	return FormatFriendly(commenter_view, addr, reveal_addr)
				217
				218
				219	def NormalizeHeader(s):
				220	"""Make our message-ids robust against mail client spacing and truncation."""
				221	words = _StripSubjectPrefixes(s).split() # Split on any runs of whitespace.
				222	normalized = ' '.join(words)
				223	truncated = normalized[:MAX_HEADER_CHARS_CONSIDERED]
				224	return truncated
				225
				226
				227	def MakeMessageID(to_addr, subject, from_addr):
				228	"""Make a unique (but deterministic) email Message-Id: value."""
				229	normalized_subject = NormalizeHeader(subject)
				230	if isinstance(normalized_subject, six.text_type):
				231	normalized_subject = normalized_subject.encode('utf-8')
				232	mail_hmac_key = secrets_svc.GetEmailKey()
				233	return '<0=%s=%s=%s@%s>' % (
				234	hmac.new(mail_hmac_key, to_addr).hexdigest(),
				235	hmac.new(mail_hmac_key, normalized_subject).hexdigest(),
				236	from_addr.split('@')[0],
				237	MailDomain())
				238
				239
				240	def GetReferences(to_addr, subject, seq_num, project_from_addr):
				241	"""Make a References: header to make this message thread properly.
				242
				243	Args:
				244	to_addr: address that email message will be sent to.
				245	subject: subject line of email message.
				246	seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
				247	or None if the message is not part of a thread.
				248	project_from_addr: address that the message will be sent from.
				249
				250	Returns:
				251	A string Message-ID that does not correspond to any actual email
				252	message that was ever sent, but it does serve to unite all the
				253	messages that belong togther in a thread.
				254	"""
				255	if seq_num is not None:
				256	return MakeMessageID(to_addr, subject, project_from_addr)
				257	else:
				258	return ''
				259
				260
				261	def ValidateReferencesHeader(message_ref, project, from_addr, subject):
				262	"""Check that the References header is one that we could have sent.
				263
				264	Args:
				265	message_ref: one of the References header values from the inbound email.
				266	project: Project PB for the affected project.
				267	from_addr: string email address that inbound email was sent from.
				268	subject: string base subject line of inbound email.
				269
				270	Returns:
				271	True if it looks like this is a reply to a message that we sent
				272	to the same address that replied. Otherwise, False.
				273	"""
				274	sender = '%s@%s' % (project.project_name, MailDomain())
				275	expected_ref = MakeMessageID(from_addr, subject, sender)
				276
				277	# TODO(jrobbins): project option to not check from_addr.
				278	# TODO(jrobbins): project inbound auth token.
				279	return expected_ref == message_ref
				280
				281
# Matches a project email address of the form
# PROJECT[+VERB[+LABEL]]@DOMAIN.  Only lowercase letters, digits and '-'
# are accepted in the project name.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'(\+(?P<verb>[a-z0-9]+)(\+(?P<label>[a-z0-9-]+))?)?'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches a subject line of the form 'Issue LOCAL_ID in PROJECT: SUMMARY'.
ISSUE_CHANGE_SUBJECT_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')

# Matches the compact subject line form 'PROJECT:LOCAL_ID: SUMMARY'.
ISSUE_CHANGE_COMPACT_SUBJECT_RE = re.compile(
    r'(?P<project>[-a-z0-9]+):'
    r'(?P<local_id>[0-9]+): '
    r'(?P<summary>.+)')
				296
				297
				298	def IdentifyIssue(project_name, subject):
				299	"""Parse the artifact id from a reply and verify it is a valid issue.
				300
				301	Args:
				302	project_name: string the project to search for the issue in.
				303	subject: string email subject line received, it must match the one
				304	sent. Leading prefixes like "Re:" should already have been stripped.
				305
				306	Returns:
				307	An int local_id for the id of the issue. None if no id is found or the id
				308	is not valid.
				309	"""
				310
				311	issue_project_name, local_id_str = _MatchSubject(subject)
				312
				313	if project_name != issue_project_name:
				314	# Something is wrong with the project name.
				315	return None
				316
				317	logging.info('project_name = %r', project_name)
				318	logging.info('local_id_str = %r', local_id_str)
				319
				320	try:
				321	local_id = int(local_id_str)
				322	except (ValueError, TypeError):
				323	local_id = None
				324
				325	return local_id
				326
				327
				328	def IdentifyProjectVerbAndLabel(project_addr):
				329	# Ignore any inbound email sent to a "no_reply@" address.
				330	if project_addr.startswith('no_reply@'):
				331	return None, None, None
				332
				333	project_name = None
				334	verb = None
				335	label = None
				336	m = PROJECT_EMAIL_RE.match(project_addr.lower())
				337	if m:
				338	project_name = m.group('project')
				339	verb = m.group('verb')
				340	label = m.group('label')
				341
				342	return project_name, verb, label
				343
				344
				345	def _MatchSubject(subject):
				346	"""Parse the project, artifact type, and artifact id from a subject line."""
				347	m = (ISSUE_CHANGE_SUBJECT_RE.match(subject) or
				348	ISSUE_CHANGE_COMPACT_SUBJECT_RE.match(subject))
				349	if m:
				350	return m.group('project'), m.group('local_id')
				351
				352	return None, None
				353
				354
				355	# TODO(jrobbins): For now, we strip out lines that look like quoted
				356	# text and then will give the user the option to see the whole email.
				357	# For 2.0 of this feature, we should change the Comment PB to have
				358	# runs of text with different properties so that the UI can present
				359	# "- Show quoted text -" and expand it in-line.
				360
				361	# TODO(jrobbins): For now, we look for lines that indicate quoted
				362	# text (e.g., they start with ">"). But, we should also collapse
				363	# multiple lines that are identical to other lines in previous
				364	# non-deleted comments on the same issue, regardless of quote markers.
				365
				366
				367	# We cut off the message if we see something that looks like a signature and
				368	# it is near the bottom of the message.
				369	SIGNATURE_BOUNDARY_RE = re.compile(
				370	r'^(([-_=]+ ?)+\|'
				371	r'cheers\|(best \|warm \|kind )?regards\|thx\|thanks\|thank you\|'
				372	r'Sent from my i?Phone\|Sent from my iPod)'
				373	r',? *$', re.I)
				374
				375	MAX_SIGNATURE_LINES = 8
				376
				377	FORWARD_OR_EXPLICIT_SIG_PATS = [
				378	r'[^0-9a-z]+(forwarded\|original) message[^0-9a-z]+\s*$',
				379	r'Updates:\s*$',
				380	r'Comment #\d+ on issue \d+ by \S+:',
				381	# If we see this anywhere in the message, treat the rest as a signature.
				382	r'--\s*$',
				383	]
				384	FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
				385	r'^(%s)(.\|\n)*' % '\|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
				386	flags=re.MULTILINE \| re.IGNORECASE)
				387
				388	# This handles gmail well, and it's pretty broad without seeming like
				389	# it would cause false positives.
				390	QUOTE_PATS = [
				391	r'^On .\s+<\s\S+?@[-a-z0-9.]+>\swrote:\s$',
				392	r'^On .* \S+?@[-a-z0-9.]+\swrote:\s$',
				393	r'^\S+?@[-a-z0-9.]+ $\S+?@[-a-z0-9.]+$\swrote:\s$',
				394	r'\S+?@[-a-z0-9]+.appspotmail.com\s.wrote:\s$',
				395	r'\S+?@[-a-z0-9]+.appspotmail.com\s+.a\s+\xc3\xa9crit\s:\s*$',
				396	r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
				397	r'^>.*$',
				398	]
				399	QUOTED_BLOCKS_RE = re.compile(
				400	r'(^\s\n)((%s)\n?)+(^\s\n)' % '\|'.join(QUOTE_PATS),
				401	flags=re.MULTILINE \| re.IGNORECASE)
				402
				403
				404	def StripQuotedText(description):
				405	"""Strip all quoted text lines out of the given comment text."""
				406	# If the rest of message is forwared text, we're done.
				407	description = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub('', description)
				408	# Replace each quoted block of lines and surrounding blank lines with at
				409	# most one blank line.
				410	description = QUOTED_BLOCKS_RE.sub('\n', description)
				411
				412	new_lines = description.strip().split('\n')
				413	# Make another pass over the last few lines to strip out signatures.
				414	sig_zone_start = max(0, len(new_lines) - MAX_SIGNATURE_LINES)
				415	for idx in range(sig_zone_start, len(new_lines)):
				416	line = new_lines[idx]
				417	if SIGNATURE_BOUNDARY_RE.match(line):
				418	# We found the likely start of a signature, just keep the lines above it.
				419	new_lines = new_lines[:idx]
				420	break
				421
				422	return '\n'.join(new_lines).strip()