# -*- coding: utf-8 -*-
import re
import os
import pygments
from pygments import lexers, formatters
from markdown import markdown
from .. import markdown_extensions
from ..utils import cached_property
from .utils import Section, ParsingStrategy, iterate_sections,\
split_section_by_regex, split_code_by_pos
This class governs all source file parsing routines. Due to differences between programming languages, an extensible parsing strategy is required (see utils.py#parsing-strategy).
class Language(object):
extensions = []
scope_keywords = []
filename_substitutes = {}
markdown_extensions = [
markdown_extensions.LinesConnector(),
markdown_extensions.SaneDefList(),
markdown_extensions.Todo(),
markdown_extensions.Pydoc(),
markdown_extensions.AutoLinkExtension(),
markdown_extensions.MathExtension(),
"markdown.extensions.def_list",
"markdown.extensions.fenced_code",
'markdown.extensions.codehilite',
'markdown.extensions.tables',
Pygments lexer corresponding to the language
@cached_property
def lexer(self):
return lexers.get_lexer_by_name(self.name.lower())
Use pygments to highlight the code
def highlight(self, code, formatter="html"):
return pygments.highlight(
code, self.lexer,
formatters.get_formatter_by_name(formatter)
)
Filename transformation according to language specifics. If filename_substitutes are defined, the filename is replaced accordingly: for example, a Python module's __init__.py corresponds to the index file of the folder and should be turned into index.html. If no filename_substitutes are declared and the filename carries one of the language's extensions, .html is appended to it; any other filename is returned unchanged.
def transform_filename(self, filename):
if filename in self.filename_substitutes:
return self.filename_substitutes[filename]
for extension in self.extensions:
if filename.endswith(extension):
return filename + ".html"
return filename
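For illustration, a minimal usage sketch, assuming the Python language class defined further below:

    py = Python()
    py.transform_filename("__init__.py")   # => "index.html" (substituted)
    py.transform_filename("module.py")     # => "module.py.html" (known extension, ".html" appended)
    py.transform_filename("Makefile")      # => "Makefile" (unknown filename, passed through)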
Language parsing strategy, i.e., a list of methods applied to the code to derive a properly formatted set of docs-code sections.
def strategy(self):
return ParsingStrategy(self.set_sections_levels, self.strip_docs_indentation,
self.set_sections_levels, self.merge_up,
self.set_sections_levels, self.merge_down,
self.set_sections_levels, self.absorb)
Apply self.strategy() to the code.
def parse(self, code, add_lineno=True):
sections = [Section(code_text=code)]
for method in self.strategy():
sections = method(sections)
Strip empty sections
sections = [section for section in sections if section.has_code() or section.has_docs()]
return sections
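A hedged sketch of the expected result; the exact section boundaries depend on the strategy of the concrete language:

    sections = Python().parse("# Sum of two numbers\ndef add(a, b):\n    return a + b\n")
    # Roughly one section pairing the stripped comment with the code it documents:
    # sections[0]["docs_text"]  ~ "Sum of two numbers"
    # sections[0]["code_text"]  ~ "def add(a, b):\n    return a + b"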
@iterate_sections(start=0)
def set_sections_levels(self, sections, i):
if sections[i]["code_text"]:
indent = re.match(r"^([ \t]*)", sections[i]["code_text"]).group(1)
sections[i]["level"] = len(indent)
elif i > 0:
sections[i]["level"] = sections[i-1]['level']
@iterate_sections(start=0)
def strip_docs_indentation(self, sections, i):
indent = re.match(r"^([ \t]*)", sections[i]["docs_text"], re.M).group(1)
sections[i]["docs_text"] = re.compile(r"^{0}".format(indent), re.M)\
.sub("", sections[i]["docs_text"])
Suck up the documentation added right under the scope-defining lines (e.g., class or function definition)
@iterate_sections()
def merge_up(self, sections, i):
if not sections[i].has_code() and not sections[i-1].has_docs() and sections[i-1].has_code():
prev_line = sections[i-1]["code_text"].strip().split("\n")[-1].strip()
If the previous line of code contains one of the scope_keywords, merge the last two sections.
if any([re.match(x, prev_line) for x in self.scope_keywords]):
sections[i-1]["docs_text"] = sections[i]["docs_text"]
sections[i:i + 1] = []
Merge the documentation placed above the code (just like the next comment)
@iterate_sections()
def merge_down(self, sections, i):
If the previous section contained docs but no code, and the current one contains code but no docs, merge them.
if not sections[i-1].has_code() and sections[i-1].has_docs()\
and not sections[i].has_docs() and sections[i].has_code():
sections[i-1]["code_text"] = sections[i]["code_text"]
sections[i-1]["scope"] = sections[i]["scope"] or sections[i-1]["scope"]
sections[i:i+1] = []
Absorb next code-only section if it lies deeper than the current one (that has docs)
@iterate_sections()
def absorb(self, sections, i):
if not sections[i].has_docs() and sections[i]['level'] > sections[i-1]['level']:
sections[i-1]['code_text'] = sections[i-1]['code_text'].rstrip('\n') \
+ '\n\n' + sections[i]['code_text'].lstrip('\n')
sections[i:i+1] = []
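For instance, a docs-bearing section followed by a deeper code-only section collapses roughly like this:

    # before absorb:
    #   [docs: "Sum of two numbers", code: "def add(a, b):", level: 0]
    #   [docs: "", code: "    return a + b", level: 4]
    # after absorb:
    #   [docs: "Sum of two numbers", code: "def add(a, b):\n\n    return a + b"]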
class PlainText(Language):
extensions = ['.txt']
filename_substitutes = {
'index.txt': 'index.html'
}
divider_text = divider_html = ""
lexer = None
def highlight(self, code):
return code
Language mixin for languages with inline comments.
class InlineCommentLanguage(Language):
Some languages have comments that are parsed by the compiler or interpreter: they can be used to activate or deactivate special options, or for debugging purposes. Usually those comments follow specific patterns that distinguish them from ordinary comments, which the compiler ignores.
The ignored_inline_patterns property is a list of regexps that match such ignored comments for the language.
ignored_inline_patterns = []
def strategy(self):
base_strategy = super(InlineCommentLanguage, self).strategy()
base_strategy.insert(0, self.parse_inline)
return base_strategy
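Regexp matching the leading inline comment delimiter; parse_inline uses it to strip the comment prefix from the docs lines.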
@cached_property
def inline_prefix(self):
return re.compile(r"^[ \t]*{0}".format(self.inline_delimiter), re.M)
Regexp capturing runs of inline comments that should become documentation.
@cached_property
def inline_re(self):
Ignored comments, as defined above, are to be treated the same way as source code rather than documentation. To achieve that, we add a regular expression that matches only when none of the ignored patterns matches: lines matching an ignored pattern are then treated as code, not documentation.
if self.ignored_inline_patterns:
Build a regexp that matches whenever none of the patterns matches. Only lines for which this regexp matches will be treated as documentation. Lines for which it doesn't match will be treated as code.
dont_match = r"(?!({0}))".\
format("|".join(pattern for pattern in self.ignored_inline_patterns))
If no ignored comment patterns have been defined for the current language, treat all comments as documentation.
else:
dont_match = ""
Whenever the text after self.inline_delimiter matches the dont_match regexp, treat the comment as documentation.
return re.compile(r"((?:^[ \t]*{0}{1}.*\n)+)".format(self.inline_delimiter,
dont_match),
flags=re.M)
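For example, with Python's "#" delimiter and the (assumed) ignored patterns listed further below, the regexp behaves roughly like this:

    # A regular comment line     -> captured as documentation
    #!/usr/bin/env python        -> skipped by the lookahead, stays with the code
    # -*- coding: utf-8 -*-      -> skipped by the lookahead, stays with the code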
The dividing token we feed into Pygments, to delimit the boundaries between sections.
@property
def divider_text(self):
return "\n{0}DIVIDER\n".format(self.inline_delimiter)
The mirror of divider_text that we expect Pygments to return; we can split on it to recover the original sections.
@property
def divider_html(self):
return re.compile(r'\n*<span class="c[1]?">{0}DIVIDER</span>\n*'
.format(self.inline_delimiter))
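For Python, for instance, the round trip looks roughly as follows; the exact span class (c or c1) depends on the Pygments lexer, hence the c[1]? pattern:

    divider_text   == "\n#DIVIDER\n"
    # after highlighting, Pygments is expected to emit something like
    #   <span class="c1">#DIVIDER</span>
    # which divider_html matches, so the HTML can be split back into sections.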
@iterate_sections(start=0)
def parse_inline(self, sections, i):
new_sections = split_section_by_regex(sections[i], self.inline_re)
for j, section in enumerate(new_sections):
if section.get("meta") != "stripped":
new_sections[j]["docs_text"] = self.inline_prefix.sub("",
new_sections[j]["docs_text"])
new_sections[j]["meta"] = "stripped"
sections[i:i+1] = new_sections
Language mixin for multiline comments. Some languages also have another syntax entity called "docblocks" - they probably should be treated separately, although they are usually captured along with multiline comments.
class MultilineCommentLanguage(Language):
multistart = '"""'
multiend = '"""'
@property
def multiline_re(self):
return re.compile(r'^(\s*{start}((?!{end})[\s\S])*){end}'
.format(start=self.multistart, end=self.multiend), flags=re.M)
def strategy(self):
base_strategy = super(MultilineCommentLanguage, self).strategy()
base_strategy.insert(0, self.parse_multiline)
return base_strategy
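Split the sections by multiline comments and strip the opening delimiter from the captured docs.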
@iterate_sections(start=0)
def parse_multiline(self, sections, i):
sections[i:i+1] = split_section_by_regex(sections[i], self.multiline_re,
meta="stripped")
sections[i]["docs_text"] = re.sub(r"^\n*(\s*){0}".format(self.multistart),
r"\1",
sections[i]["docs_text"])
In indent-based languages it is quite easy to find a proper place to split a code section: whenever the indent of a line becomes smaller than the indent of the first line of the section, split there.
TODO: Consider using some preprocessor instead of literal matching of the indentation. For example, https://github.com/sirthias/parboiled/wiki/Indentation-Based-Grammars, https://github.com/Cirru/cirru-parser
class IndentBasedLanguage(Language):
@iterate_sections(start=0)
def split_by_scopes(self, sections, i):
indent = re.match(r"^(\s*)", sections[i]["code_text"].strip("\n")).group(1)
regex = re.compile(r"^(\s{{0,{0}}}\S)".format(len(indent) - 1), flags=re.M)
match = regex.search(sections[i]["code_text"], pos=len(indent) + 1)
if match:
sections = split_code_by_pos(i, match.start(), sections)
sections[i]['level'] = len(indent)
sections[i+1]['level'] = len(match.group(1).strip("\n"))
regex = re.compile(r"({0})".format("|".join(self.scope_keywords)), flags=re.M)
match = regex.search(sections[i]["code_text"])
if match and match.start() == 0:
sections[i]['scope'] = match.group(1).strip()
match = regex.search(sections[i]["code_text"], pos=match.start() + 1)
if match:
sections = split_code_by_pos(i, match.start(), sections)
sections[i+1]['code_text'] = sections[i+1]['code_text'].strip('\n')
sections[i+1]['scope'] = match.group(1).strip()
def strategy(self):
base_strategy = super(IndentBasedLanguage, self).strategy()
base_strategy.insert_before('merge_up', self.split_by_scopes)
return base_strategy
Language mixin for brace-based languages.
class BraceBasedLanguage(Language):
Split the code sections by the scope_keywords of the language.
TODO: consider splitting also by braces interiors
@iterate_sections(start=0)
def split_by_scopes(self, sections, i):
regex = re.compile(r"^({0})".format("|".join(self.scope_keywords)), flags=re.M)
match = regex.search(sections[i]["code_text"])
if match and match.start() == 0:
match = regex.search(sections[i]["code_text"], pos=match.start() + 1)
if match:
split_code_by_pos(i, match.start(), sections)
sections[i+1]['code_text'] = sections[i+1]['code_text'].strip('\n')
sections[i+1]['scope'] = match.group(1).strip()
def strategy(self):
base_strategy = super(BraceBasedLanguage, self).strategy()
base_strategy.insert_before('merge_up', self.split_by_scopes)
return base_strategy
The styling of C/C++ code is largely historical and oriented towards reading hopelessly long code on old terminal screens.
TODO: detect whole decoratively-styled docs blocks, for example, boxes:

    ////////////////////////////
    ////// Nice comment! ///////
    ////////////////////////////

    /***************************
    **** We love ASCII-art! ****
    ***************************/
class C(BraceBasedLanguage, InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".c", ".cpp", ".h", ".cc"]
inline_delimiter = "//"
multistart = r"/\*+"
multiend = r"\*+/"
scope_keywords = [r"\s*({0}) \w+".format(word) for word in
["class", "function", "namespace",
"public", "private", "protected", "abstract", "inline", "virtual",
"void", "int", "double", "bool", "float"]]
FIXME: this redefinition breaks the whole extension
def __init__(self, *args, **kwargs):
    super(C, self).__init__(*args, **kwargs)
    for i, ext in enumerate(self.markdown_extensions):
        if isinstance(ext, markdown_extensions.LinesConnector):
            self.markdown_extensions[i] = \
                markdown_extensions.LinesConnector(regex=r"([\w\.])[ \t]*\n[ \t]*(\w)")
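Strip the decorative comment characters (leading runs of / or *) from the docs lines.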
@iterate_sections(start=0)
def strip_commenting_design(self, sections, i):
sections[i]["docs_text"] = re.compile(r"^[ \t]*(\/+|\*+)(.*)$", re.M)\
.sub(r"\2", sections[i]["docs_text"])
def strategy(self):
base_strategy = super(C, self).strategy()
base_strategy.insert_before('strip_docs_indentation', self.strip_commenting_design)
base_strategy.insert_after('strip_docs_indentation', self.split_by_scopes)
return base_strategy
JavaScript is largely identical to C/C++, although it has far fewer scope keywords and far more flexibility in defining functions and objects.
TODO: test function-defining scopes:

    function name(args) { ... }
    name = function(args) { ... }
class JavaScript(C):
extensions = [".js"]
scope_keywords = [r"\s*(class)", r"^.*(function)[ \t]*\("]
class PHP(C):
extensions = [".php"]
def strategy(self):
base_strategy = super(PHP, self).strategy()
base_strategy.delete('merge_up')
return base_strategy
Obviously, Python language parsing is the best-developed part of Pyccoon.
TODO: also support ''' comment delimiters.
class Python(IndentBasedLanguage, MultilineCommentLanguage, InlineCommentLanguage):
extensions = [".py", ".pyx"]
inline_delimiter = "#"
ignored_inline_patterns = [
    r"!",          # shebangs, e.g. `#!/usr/bin/env python` (assumed pattern)
    r"\s*-\*-",    # coding declarations, e.g. `# -*- coding: utf-8 -*-` (assumed pattern)
]
__init__.py files can perfectly serve as module index files.
filename_substitutes = {
"__init__.py": "index.html"
}
scope_keywords = [r"^\s*(def) ", r"^\s*(class) ", r"^\s*(@)\w+"]
def strategy(self):
base_strategy = super(Python, self).strategy()
base_strategy.insert_before('absorb', self.python_absorb)
return base_strategy
Python decorators are a tricky part of properly parsing the source file into sections of docs and code. Whenever a decorator section occurs, it should be merged not into the previous section, but into the next one.
@iterate_sections()
def python_absorb(self, sections, i):
if '@' in sections[i-1]['scope']:
sections[i]['docs_text'] = sections[i-1]['docs_text'] + sections[i]['docs_text']
sections[i]['code_text'] = sections[i-1]['code_text'] + sections[i]['code_text']
sections[i-1:i+1] = [sections[i]]
return i
class Fortran(IndentBasedLanguage, MultilineCommentLanguage, InlineCommentLanguage):
extensions = [".f", ".f90"]
inline_delimiter = "!"
multistart = r'^[Cc]'
multiend = r'$'
scope_keywords = [r"^\s*(subroutine) ", r"^\s*(function)", r"^\s*(module)", r"^\s*(program)"]
Mostly identical to Python.
TODO: Actually, Ruby is crazy and supports an unbelievable variety of multiline comment syntaxes:

    multistart = ["=begin", "<<-DOC", "\"", "__END__"]
    multiend = ["=end", "DOC", "\"", ""]

We will have to rethink multiline comment capturing to support them all.
class Ruby(IndentBasedLanguage, InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".rb"]
inline_delimiter = "#"
multistart = "=begin"
multiend = "=end"
scope_keywords = [r"^\s*(module) ", r"^\s*(class) ", r"^\s*(def) "]
class CoffeeScript(InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".coffee"]
name = "Coffee-Script"
inline_delimiter = "#"
multistart = "###"
multiend = "###"

class Perl(InlineCommentLanguage):
extensions = [".pl"]
inline_delimiter = "#"

class SQL(InlineCommentLanguage):
extensions = [".sql"]
inline_delimiter = "--"

class Scheme(InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".scm"]
inline_delimiter = ";;"
multistart = "#|"
multiend = "|#"

class Lua(InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".lua"]
inline_delimiter = "--"
multistart = "--[["
multiend = "--]]"

class Erlang(InlineCommentLanguage):
extensions = [".erl"]
inline_delimiter = "%%"

class Tcl(InlineCommentLanguage):
extensions = [".tcl"]
inline_delimiter = "#"

class Haskell(InlineCommentLanguage, MultilineCommentLanguage):
extensions = [".hs"]
inline_delimiter = "--"
multistart = "{-"
multiend = "-}"

extensions_mapping = {}
languages = [PlainText, C, PHP, JavaScript, Python, Fortran, Ruby,
CoffeeScript, Perl, SQL, Scheme, Lua, Erlang, Tcl, Haskell]
for language in languages:
instance = language()
for extension in instance.extensions:
extensions_mapping[extension] = instance
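The mapping then resolves a file extension to a shared language instance:

    extensions_mapping[".py"]   # => the Python() instance
    extensions_mapping[".hs"]   # => the Haskell() instance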
Get the current language we're documenting, based on the extension.
def get_language(source, code, language=None):
if language is not None:
for l in extensions_mapping.values():
if l.name == language:
return l
else:
raise ValueError("Unknown forced language: " + language)
m = re.match(r'.*(\..+)', os.path.basename(source))
if m and m.group(1) in extensions_mapping:
return extensions_mapping[m.group(1)]
else:
try:
lang = lexers.guess_lexer(code).name.lower()
except Exception:
return None
for l in extensions_mapping.values():
if l.name.lower() == lang:
return l
else:
return None
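A few hedged examples of the lookup order (explicit name first, then extension, then Pygments guessing); the forced name must match the instance's name attribute exactly:

    get_language("pyccoon/languages.py", code)             # => the Python instance, by the ".py" extension
    get_language("script", "#!/usr/bin/env python\n...")   # => falls back to lexers.guess_lexer
    get_language("notes.xyz", code, language="Python")     # => forced lookup by name (assumed to be "Python")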