ABX/abx/parsers.py

491 lines
17 KiB
Python
Raw Normal View History

# parsers (sub-package)
"""
Filename Parsers & Registry for FileContext.
"""
import re, copy, os
import yaml
NameParsers = {} # Parser registry
def registered_parser(parser):
"""
Decorator function to register a parser class.
"""
NameParsers[parser.name] = parser
return parser
wordre = re.compile(r'([A-Z][a-z]+|[a-z]+|[0-9]+|[A-Z][A-Z]+)')
@registered_parser
class Parser_ABX_Episode:
"""
Original "Lunatics!" filename parsing algorithm. (DEPRECATED)
This parser was written before the Schema parser. It hard-codes the schema used
in the "Lunatics!" Project, and can probably be safely replaced by using the Schema
parser with appropriate YAML settings in the <project>.yaml file, which also allows
much more flexibility in naming schemes.
YAML parameter settings available for this parser:
---
definitions:
parser: abx_episode # Force use of this parser
parser_options: # Available settings (w/ defaults)
field_separator: '-'
episode_separator: 'E'
filetype_separator: '.'
Filetypes and roles are hard-code, and can't be changed from the YAML.
Assumes field-based filenames of the form:
<series>E<episode>[-<seq>[-<block>[-Cam<camera>][-<shot>]]][-<title>]-<role>.<filetype>
Where the <field> indicates fields with fieldnames, and there are three expected separators:
- is the 'field_separator'
E is the 'episode_separator'
. is the 'filetype_separator'
(These can be overridden in the initialization).
The class is callable, taking a string as input and returning a dictionary of fields.
"""
name = 'abx_episode'
max_score = 10 # Maximum number of fields parsed
# supported values for filetype
filetypes = {
'blend': "Blender File",
'kdenlive': "Kdenlive Video Editor File",
'mlt': "Kdenlive Video Mix Script",
'svg': "Scalable Vector Graphics (Inkscape)",
'kra': "Krita Graphic File",
'xcf': "Gimp Graphic File",
'png': "Portable Network Graphics (PNG) Image",
'jpg': "Joint Photographic Experts Group (JPEG) Image",
'aup': "Audacity Project",
'ardour': "Ardour Project",
'flac': "Free Lossless Audio Codec (FLAC)",
'mp3': "MPEG Audio Layer III (MP3) Audio File",
'ogg': "Ogg Vorbis Audio File",
'avi': "Audio Video Interleave (AVI) Video Container",
'mkv': "Matroska Video Container",
'mp4': "Moving Picture Experts Group (MPEG) 4 Format}",
'txt': "Plain Text File"
}
# Roles that make sense in an episode context
roles = {
'extras': "Extras, crowds, auxillary animated movement",
'mech': "Mechanical animation",
'anim': "Character animation",
'cam': "Camera direction",
'vfx': "Visual special effects",
'compos': "Compositing",
'bkg': "Background 2D image",
'bb': "Billboard 2D image",
'tex': "Texture 2D image",
'foley': "Foley sound",
'voice': "Voice recording",
'fx': "Sound effects",
'music': "Music track",
'cue': "Musical cue",
'amb': "Ambient sound",
'loop': "Ambient sound loop",
'edit': "Video edit"
}
# A few filetypes imply their roles:
roles_by_filetype = {
'kdenlive': 'edit',
'mlt': 'edit'
}
def __init__(self, field_separator='-', episode_separator='E', filetype_separator='.',
fields=None, filetypes=None, roles=None, **kwargs):
if not fields:
fields = {}
if filetypes:
self.filetypes = copy.deepcopy(self.filetypes) # Copy class attribute to instance
self.filetypes.update(filetypes) # Update with new values
if roles:
self.roles = copy.deepcopy(self.roles) # Copy class attribute to instance
self.roles.update(roles) # Update with new values
self.field_separator = field_separator
self.episode_separator = episode_separator
self.filetype_separator = filetype_separator
def __call__(self, filename, namepath):
score = 0.0
fielddata = {}
# Check for filetype ending
i_filetype = filename.rfind(self.filetype_separator)
if i_filetype < 0:
fielddata['filetype'] = None
else:
fielddata['filetype'] = filename[i_filetype+1:]
filename = filename[:i_filetype]
score = score + 1.0
components = filename.split(self.field_separator)
# Check for role marker in last component
if components[-1] in self.roles:
fielddata['role'] = components[-1]
del components[-1]
fielddata['hierarchy'] = 'episode'
score = score + 2.0
elif fielddata['filetype'] in self.roles_by_filetype:
fielddata['role'] = self.roles_by_filetype[fielddata['filetype']]
fielddata['hierarchy'] = 'episode'
else:
fielddata['role'] = None
fielddata['hierarchy'] = None
# Check for a descriptive title (must be 3+ characters in length)
if components and len(components[-1])>2:
# Normalize the title as words with spaces
title = ' '.join(w for w in wordre.split(components[-1]) if wordre.fullmatch(w))
del components[-1]
score = score + 1.0
else:
title = None
# Check if first field contains series/episode number
if components:
prefix = components[0]
try:
fielddata['series'] = {}
fielddata['episode'] = {}
fielddata['series']['code'], episode_id = prefix.split(self.episode_separator)
fielddata['episode']['code'] = int(episode_id)
fielddata['rank'] = 'episode'
del components[0]
score = score + 2.0
except:
pass
# Check for sequence/block/shot/camera designations
if components:
fielddata['seq'] = {}
fielddata['seq']['code'] = components[0]
fielddata['rank'] = 'seq'
del components[0]
score = score + 1.0
if components:
try:
fielddata['block'] = {}
fielddata['block']['code'] = int(components[0])
del components[0]
fielddata['rank'] = 'block'
score = score + 1.0
except:
pass
if components and components[0].startswith('Cam'):
fielddata['camera'] = {}
fielddata['camera']['code'] = components[0][len('Cam'):]
fielddata['rank'] = 'camera'
del components[0]
score = score + 1.0
if components:
# Any remaining structure is joined back to make the shot ID
fielddata['shot'] = {}
fielddata['shot']['code'] = ''.join(components)
fielddata['rank'] = 'shot'
components = None
score = score + 1.0
if title and fielddata['rank'] in fielddata:
fielddata[fielddata['rank']]['title'] = title
return score/self.max_score, fielddata
DEFAULT_YAML = {}
with open(os.path.join(os.path.dirname(__file__), 'abx.yaml')) as def_yaml_file:
DEFAULT_YAML.update(yaml.safe_load(def_yaml_file))
@registered_parser
class Parser_ABX_Fallback(object):
"""
Highly-tolerant parser to fall back on if others fail.
The fallback parser makes only a very minimal and robust set of assumptions.
Any legal filename will successfully return a simple parse, though much
interpretation may be lost. It still allows for common field-based practices,
but falls back on using the unaltered filename if necessary.
YAML options available:
---
definitions:
parser: abx_fallback # Force use of this parser.
There are no other options. Field separators are defined very broadly,
and include most non-word characters (~#$!=+&_-). This was mostly designed
to work without a project schema available.
"""
name = 'abx_fallback'
filetypes = DEFAULT_YAML['definitions']['filetypes']
roles = DEFAULT_YAML['definitions']['roles']
roles_by_filetype = (
DEFAULT_YAML['definitions']['roles_by_filetype'])
main_sep_re = re.compile(r'\W+') # Any single non-word char
comment_sep_re = re.compile(r'[\W_][\W_]+|[~#$!=+&]+')
def __init__(self, **kwargs):
pass
def _parse_ending(self, filename, separator):
try:
remainder, suffix = filename.rsplit(separator, 1)
score = 1.0
except ValueError:
remainder = filename
suffix = None
score = 0.0
return (suffix, remainder, score)
def __call__(self, filename, namepath):
fields = {}
score = 1.0
possible = 4.5
split = filename.rsplit('.', 1)
if len(split)<2 or split[1] not in self.filetypes:
fields['filetype'] = None
remainder = filename
score += 1.0
else:
fields['filetype'] = split[1]
remainder = split[0]
comment_match = self.comment_sep_re.search(remainder)
if comment_match:
fields['comment'] = remainder[comment_match.end():]
remainder = remainder[:comment_match.start()]
else:
fields['comment'] = None
role = self.main_sep_re.split(remainder)[-1]
if role in self.roles:
fields['role'] = role
remainder = remainder[:-1-len(role)]
score += 1.0
else:
fields['role'] = None
# Implied role
if fields['filetype'] in self.roles_by_filetype:
fields['role'] = self.roles_by_filetype[fields['filetype']]
score += 1.0
words = self.main_sep_re.split(remainder)
fields['code'] = ''.join([w.capitalize() for w in words])
fields['title'] = remainder
return score/possible, fields
@registered_parser
class Parser_ABX_Schema(object):
"""
Parser based on using the list of schemas.
The schemas are normally defined in the project root directory YAML.
Expands on the 'abx_episode' parser by allowing all the schema to
be defined by outside configuration data (generally provided in a
project YAML file, but this module does not depend on the data
source used).
The project YAML can additionally control parsing with this parser:
---
definitions:
parser: abx_schema # Force use of this parser
parser_options: # Set parameters
filetype_separator: '.'
comment_separator: '--'
role_separator: '-'
title_separator: '-'
filetypes: # Recognized filetypes.
blend: Blender File # <filetype>: documentation
...
roles: # Recognized role fields.
anim: Character Animation # <role>: documentation
...
roles_by_filetype: # Roles implied by filetype.
kdenlive: edit # <filetype>:<role>
...
(For the full default lists see abx/abx.yaml).
schemas (list): The current schema-list defining how filenames should be parsed.
This "Schema" parser uses this to determine both parsing and
mapping of text fields in the filename.
definitions(dict): The project definitions currently visible to the parser.
"""
name = 'abx_schema'
def __init__(self, schemas=None, definitions=None,
filetype_separator = '.',
comment_separator = '--',
role_separator = '-',
title_separator = '-',
**kwargs):
self.filetype_separator = filetype_separator
self.comment_separator = comment_separator
self.role_separator = role_separator
self.title_separator = title_separator
self.schemas = schemas
if 'roles' in definitions:
self.roles = definitions['roles']
else:
self.roles = []
if 'filetypes' in definitions:
self.filetypes = definitions['filetypes']
else:
self.filetypes = []
if 'roles_by_filetype' in definitions:
self.roles_by_filetype = definitions['roles_by_filetype']
else:
self.roles_by_filetype = []
def _parse_ending(self, filename, separator):
try:
remainder, suffix = filename.rsplit(separator, 1)
score = 1.0
except ValueError:
remainder = filename
suffix = None
score = 0.0
return (suffix, remainder, score)
def _parse_beginning(self, filename, separator):
try:
prefix, remainder = filename.split(separator, 1)
score = 1.0
except ValueError:
prefix = filename
remainder = ''
score = 0.0
return (prefix, remainder, score)
def __call__ (self, filename, namepath, debug=False):
fields = {}
score = 0.0
possible = 0.0
# First get specially-handled extensions
remainder = filename
field, newremainder, s = self._parse_ending(remainder, self.filetype_separator)
if field and field in self.filetypes:
remainder = newremainder
fields['filetype'] = field
score += s*1.0
else:
fields['filetype'] = None
field, remainder, s = self._parse_ending(remainder, self.comment_separator)
fields['comment'] = field
score += s*0.5
field, newremainder, s = self._parse_ending(remainder, self.role_separator)
if field and field in self.roles:
remainder = newremainder
fields['role'] = field
score += s*0.5
else:
fields['role'] = None
field, remainder, s = self._parse_ending(remainder, self.title_separator)
fields['title'] = field
score += s*0.5
possible += 3.0
# Implicit roles
if ( not fields['role'] and
fields['filetype'] and
fields['role'] in self.roles_by_filetype):
self.role = self.roles_by_filetype[fields['filetype']]
score += 0.2
#possible += 0.2
# Figure the rest out from the schema
# Find the matching rank start position for the filename
start = 0
for start, (schema, name) in enumerate(zip(self.schemas, namepath)):
field, r, s = self._parse_beginning(remainder, schema.delimiter)
try:
if field.lower() == schema.format.format(name).lower():
score += 1.0
break
except ValueError:
print(' (365) field, format', field, schema.format)
possible += 1.0
# Starting from that position, try to match fields
# up to the end of the namepath (checking against it)
irank = 0
for irank, (schema, name) in enumerate(
zip(self.schemas[start:], namepath[start:])):
if not remainder: break
field, remainder, s = self._parse_beginning(remainder, schema.delimiter)
score += s
try:
if ( type(field) == str and
field.lower() == schema.format.format(name).lower()):
fields[schema.rank]={'code':field}
fields['rank'] = schema.rank
score += 1.0
except ValueError:
print(' (384) field, format', field, schema.format)
possible += 2.0
# Remaining fields are authoritative (doesn't affect score)
for schema in self.schemas[irank:]:
if not remainder: break
field, remainder, s = self._parse_beginning(remainder, schema.delimiter)
fields[schema.rank]={'code':field}
fields['rank'] = schema.rank
if 'rank' in fields:
fields[fields['rank']]['title'] = fields['title']
if not fields['role'] and fields['filetype'] in self.roles_by_filetype:
fields['role'] = self.roles_by_filetype[fields['filetype']]
return score/possible, fields