Blame - framework/filecontent.py - monorail-avm99963

blob: 7e7964391f8ed7e95bfe696259aab3ca089a2681 [file] [log] [blame]

Adrià Vilanova Martínez	f19ea43	2024-01-23 20:20:52 +0100	[diff] [blame^]	1	# Copyright 2016 The Chromium Authors
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	4
				5	"""Utility routines for dealing with MIME types and decoding text files."""
				6
				7	from __future__ import division
				8	from __future__ import print_function
				9	from __future__ import absolute_import
				10
				11	import itertools
				12	import logging
Adrià Vilanova Martínez	f19ea43	2024-01-23 20:20:52 +0100	[diff] [blame^]	13	import six
Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	14
				15	from framework import framework_constants
				16
				17
# Maps a lowercase filename extension (without the leading dot) to the MIME
# type that GuessContentTypeFromFilename() returns for it.
_EXTENSION_TO_CTYPE_TABLE = {
    # These are images, PDFs and videos that we trust the browser to display.
    'gif': 'image/gif',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'webp': 'image/webp',
    'ico': 'image/x-icon',
    'svg': 'image/svg+xml',
    'pdf': 'application/pdf',
    'ogv': 'video/ogg',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mpg': 'video/mp4',
    'mpeg': 'video/mp4',
    'webm': 'video/webm',

    # We do not serve mimetypes that cause the browser to launch a local
    # app because that is not required for issue tracking and it is a
    # potential security risk.
}
				39
				40
				41	def GuessContentTypeFromFilename(filename):
				42	"""Guess a file's content type based on the filename extension.
				43
				44	Args:
				45	filename: String name of a file.
				46
				47	Returns:
				48	MIME type string to use when serving this file. We only use text/plain for
				49	text files, appropriate image content-types, or application/octet-stream
				50	for virtually all binary files. This limits the richness of the user's
				51	experience, e.g., the user cannot open an MS Office application directly
				52	by clicking on an attachment, but it is safer.
				53	"""
				54	ext = filename.split('.')[-1] if ('.' in filename) else ''
				55	ext = ext.lower()
				56	if ext in COMMON_TEXT_FILE_EXTENSIONS:
				57	return 'text/plain'
				58	return _EXTENSION_TO_CTYPE_TABLE.get(ext.lower(), 'application/octet-stream')
				59
				60
# Constants used in detecting if a file has binary content.
# A file passes as text only if none of its lines is longer than the upper
# limit and at least _SOURCE_LINE_LEN_LOWER_RATIO of its lines are no longer
# than the lower limit; otherwise it is treated as binary.
_MAX_SOURCE_LINE_LEN_LOWER = 350
_MAX_SOURCE_LINE_LEN_UPPER = 800
_SOURCE_LINE_LEN_LOWER_RATIO = 0.9

# Message to display for undecodable commit log or author values.
UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

# How large a repository file is in bytes before we don't try to display it
SOURCE_FILE_MAX_SIZE = 1000 * 1024
# A file with more than this many lines is reported as "long" by
# DecodeFileContents(), which also examines at most this many lines when
# guessing whether the content is binary.
SOURCE_FILE_MAX_LINES = 50000
				74
				75	# The source code browser will not attempt to display any filename ending
				76	# with one of these extensions.
				77	COMMON_BINARY_FILE_EXTENSIONS = {
				78	'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
				79	'bmp', 'png', 'vsd,' 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
				80	'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
				81	'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
				82	'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
				83	'graffie', 'violet', 'webm', 'webp',
				84	}
				85
				86	# The source code browser will display file contents as text data for files
				87	# with the following extensions or exact filenames (assuming they decode
				88	# correctly).
				89	COMMON_TEXT_FILE_EXTENSIONS = (
				90	set(framework_constants.PRETTIFY_CLASS_MAP.keys()) \| {
				91	'',
				92	'ada',
				93	'asan',
				94	'asm',
				95	'asp',
				96	'bat',
				97	'cgi',
				98	'csv',
				99	'diff',
				100	'el',
				101	'emacs',
				102	'jsp',
				103	'log',
				104	'markdown',
				105	'md',
				106	'mf',
				107	'patch',
				108	'plist',
				109	'properties',
				110	'r',
				111	'rc',
				112	'txt',
				113	'vim',
				114	'wiki',
				115	'xemacs',
				116	'yacc',
				117	})
				118	COMMON_TEXT_FILENAMES = (
				119	set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP.keys()) \|
				120	{'authors', 'install', 'readme'})
				121
				122
				123	def DecodeFileContents(file_contents, path=None):
				124	"""Try converting file contents to unicode using utf-8 or latin-1.
				125
				126	This is applicable to untrusted maybe-text from vcs files or inbound emails.
				127
				128	We try decoding the file as utf-8, then fall back on latin-1. In the former
				129	case, we call the file a text file; in the latter case, we guess whether
				130	the file is text or binary based on line length.
				131
				132	If we guess text when the file is binary, the user sees safely encoded
				133	gibberish. If the other way around, the user sees a message that we will
				134	not display the file.
				135
				136	TODO(jrobbins): we could try the user-supplied encoding, iff it
				137	is one of the encodings that we know that we can handle.
				138
				139	Args:
				140	file_contents: byte string from uploaded file. It could be text in almost
				141	any encoding, or binary. We cannot trust the user-supplied encoding
				142	in the mime-type property.
				143	path: string pathname of file.
				144
				145	Returns:
				146	The tuple (unicode_string, is_binary, is_long):
				147	- The unicode version of the string.
				148	- is_binary is true if the string could not be decoded as text.
				149	- is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.
				150	"""
				151	# If the filename is one that typically identifies a binary file, then
				152	# just treat it as binary without any further analysis.
				153	ext = None
				154	if path and '.' in path:
				155	ext = path.split('.')[-1]
				156	if ext.lower() in COMMON_BINARY_FILE_EXTENSIONS:
				157	# If the file is binary, we don't care about the length, since we don't
				158	# show or diff it.
				159	return u'', True, False
				160
				161	# If the string can be decoded as utf-8, we treat it as textual.
				162	try:
Adrià Vilanova Martínez	f19ea43	2024-01-23 20:20:52 +0100	[diff] [blame^]	163	u_str = six.ensure_text(file_contents)
Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	164	is_long = len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES
				165	return u_str, False, is_long
				166	except UnicodeDecodeError:
				167	logging.info('not a utf-8 file: %s bytes', len(file_contents))
				168
				169	# Fall back on latin-1. This will always succeed, since every byte maps to
				170	# something in latin-1, even if that something is gibberish.
Adrià Vilanova Martínez	f19ea43	2024-01-23 20:20:52 +0100	[diff] [blame^]	171	u_str = six.ensure_text(file_contents, encoding='latin-1')
Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	172
				173	lines = u_str.split('\n')
				174	is_long = len(lines) > SOURCE_FILE_MAX_LINES
				175	# Treat decodable files with certain filenames and/or extensions as text
				176	# files. This avoids problems with common file types using our text/binary
				177	# heuristic rules below.
				178	if path:
				179	name = path.split('/')[-1]
				180	if (name.lower() in COMMON_TEXT_FILENAMES or
				181	(ext and ext.lower() in COMMON_TEXT_FILE_EXTENSIONS)):
				182	return u_str, False, is_long
				183
				184	# HEURISTIC: Binary files can qualify as latin-1, so we need to
				185	# check further. Any real source code is going to be divided into
				186	# reasonably sized lines. All lines must be below an upper character limit,
				187	# and most lines must be below a lower limit. This allows some exceptions
				188	# to the lower limit, but is more restrictive than just using a single
				189	# large character limit.
				190	is_binary = False
				191	lower_count = 0
				192	for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
				193	size = len(line)
				194	if size <= _MAX_SOURCE_LINE_LEN_LOWER:
				195	lower_count += 1
				196	elif size > _MAX_SOURCE_LINE_LEN_UPPER:
				197	is_binary = True
				198	break
				199
				200	ratio = lower_count / float(max(1, len(lines)))
				201	if ratio < _SOURCE_LINE_LEN_LOWER_RATIO:
				202	is_binary = True
				203
				204	return u_str, is_binary, is_long