# javadoc.py 9.8 KB
#
# javadoc.py: javadoc docstring parsing
# Edward Loper
#
# Created [07/03/03 12:37 PM]
# $Id: javadoc.py 1574 2007-03-07 02:55:14Z dvarrazzo $
#

"""
Epydoc parser for U{Javadoc<http://java.sun.com/j2se/javadoc/>}
docstrings.  Javadoc is an HTML-based markup language that was
developed for documenting Java APIs with inline comments.  It consists
of raw HTML, augmented by Javadoc tags.  There are two types of
Javadoc tag:

  - X{Javadoc block tags} correspond to Epydoc fields.  They are
    marked by starting a line with a string of the form \"C{@M{tag}
    [M{arg}]}\", where C{M{tag}} indicates the type of block, and
    C{M{arg}} is an optional argument.  (For fields that take
    arguments, Javadoc assumes that the single word immediately
    following the tag is an argument; multi-word arguments cannot be
    used with javadoc.)  
  
  - X{inline Javadoc tags} are used for inline markup.  In particular,
    epydoc uses them for crossreference links between documentation.
    Inline tags may appear anywhere in the text, and have the form
    \"C{{@M{tag} M{[args...]}}}\", where C{M{tag}} indicates the
    type of inline markup, and C{M{args}} are optional arguments.

Epydoc supports all Javadoc tags, I{except}:
  - C{{@docRoot}}, which gives the (relative) URL of the generated
    documentation's root.
  - C{{@inheritDoc}}, which copies the documentation of the nearest
    overridden object.  This can be used to combine the documentation
    of the overridden object with the documentation of the
    overriding object.
  - C{@serial}, C{@serialField}, and C{@serialData} which describe the
    serialization (pickling) of an object.
  - C{{@value}}, which copies the value of a constant.

@warning: Epydoc only supports HTML output for Javadoc docstrings.
"""
# Declares the markup language (epytext) and natural language (English)
# used by the docstrings in this module.
__docformat__ = 'epytext en'

# Imports
import re
from xml.dom.minidom import *
from epydoc.markup import *

def parse_docstring(docstring, errors, **options):
    """
    Wrap a Javadoc-formatted docstring in a C{ParsedDocstring}
    object.  The docstring is handed, together with the error list,
    to L{ParsedJavadocDocstring}, whose instance is returned.

    @param docstring: The Javadoc-formatted docstring to parse.
    @type docstring: C{string}
    @param errors: A list that receives any errors generated while
        parsing.
    @type errors: C{list} of L{ParseError}
    @param options: Extra options.  None are currently defined, and
        any that are given are ignored.
    @rtype: L{ParsedDocstring}
    """
    parsed = ParsedJavadocDocstring(docstring, errors=errors)
    return parsed

class ParsedJavadocDocstring(ParsedDocstring):
    """
    An encoded version of a Javadoc docstring.  Since Javadoc is a
    fairly simple markup language, we don't do any processing in
    advance; instead, we wait to split fields or resolve
    crossreference links until we need to.  The only work done at
    construction time is checking that every inline C{{@link}} tag
    is well formed.

    @group Field Splitting: split_fields, _ARG_FIELDS, _FIELD_RE
    @cvar _ARG_FIELDS: A list of the fields that take arguments.
        Since Javadoc doesn't mark arguments in any special way, we
        must consult this list to decide whether the first word of a
        field is an argument or not.
    @cvar _FIELD_RE: A regular expression used to search for Javadoc
        block tags.

    @group HTML Output: to_html, _LINK_SPLIT_RE, _LINK_RE
    @cvar _LINK_SPLIT_RE: A regular expression used to search for
        Javadoc inline tags.
    @cvar _LINK_RE: A regular expression used to process Javadoc
        inline tags.

    @group Plaintext Output: to_plaintext, summary, _SUMMARY_RE
    @cvar _SUMMARY_RE: A regular expression matching everything up to
        and including the first period that is followed by whitespace
        or by the end of the text.

    @ivar _docstring: The raw, unprocessed Javadoc docstring.
    """
    def __init__(self, docstring, errors=None):
        """
        Create a new C{ParsedJavadocDocstring}.

        @param docstring: The docstring that should be used to
            construct this C{ParsedJavadocDocstring}.
        @type docstring: C{string}
        @param errors: A list where any errors generated during
            parsing will be stored.  If no list is given, then
            all errors are ignored.
        @type errors: C{list} of L{ParseError}
        """
        self._docstring = docstring
        # With no caller-supplied list, collect errors in a throwaway
        # list so that they are simply discarded.
        if errors is None:
            errors = []
        self._check_links(errors)

    #////////////////////////////////////////////////////////////
    # Field Splitting
    #////////////////////////////////////////////////////////////

    _ARG_FIELDS = ('group variable var type cvariable cvar ivariable '+
                   'ivar param '+
                   'parameter arg argument raise raises exception '+
                   'except deffield newfield keyword kwarg kwparam').split()

    # A block tag is "@word" at the start of a line, followed either by
    # whitespace or by the end of the docstring.  (Inside a character
    # class "$" is a literal dollar sign, so the end-of-text alternative
    # has to be spelled as an explicit group.)
    _FIELD_RE = re.compile(r'(^\s*\@\w+(?:\s|$))', re.MULTILINE)

    # Inherit docs from ParsedDocstring.
    def split_fields(self, errors=None):
        # Sub-docstrings are built without an error list: their links
        # were already checked when this docstring was constructed.

        # Splitting on a pattern with one capturing group yields
        # [description, tag1, body1, tag2, body2, ...].
        pieces = self._FIELD_RE.split(self._docstring)
        descr_text = pieces[0]

        fields = []
        for raw_tag, raw_body in zip(pieces[1::2], pieces[2::2]):
            # Drop surrounding whitespace and the leading '@'.
            tag = raw_tag.strip()[1:]

            # Get the field argument (if appropriate).
            if tag in self._ARG_FIELDS:
                # The padding guarantees two items even when the
                # argument and/or the body is missing.
                subpieces = raw_body.strip().split(None, 1) + ['', '']
                (arg, body) = subpieces[:2]
            else:
                (arg, body) = (None, raw_body)

            # Special processing for @see fields, since Epydoc
            # allows unrestricted text in them, but Javadoc just
            # uses them for xref links.  The decision is made on the
            # stripped text, so that indentation and the trailing
            # newline can neither hide a quote/HTML marker nor end up
            # inside the generated link.
            if tag == 'see':
                see = body.strip()
                if not see:
                    pass # Nothing to link to; leave the body alone.
                elif see[0] in '"\'':
                    # Quoted string: plain text, without the quotes.
                    if see[-1] == see[0]: body = see[1:-1]
                elif see[0] == '<' or see.startswith('{@'):
                    # Raw HTML, or already an inline tag: keep as is
                    # (wrapping an inline tag again would break it).
                    pass
                else:
                    body = '{@link %s}' % see

            # Construct the field.
            fields.append(Field(tag, arg, ParsedJavadocDocstring(body)))

        # A blank description is reported as None.
        if descr_text.strip():
            return (ParsedJavadocDocstring(descr_text), fields)
        else:
            return (None, fields)

    #////////////////////////////////////////////////////////////
    # HTML Output.
    #////////////////////////////////////////////////////////////

    _LINK_SPLIT_RE = re.compile(r'({@link(?:plain)?\s[^}]+})')
    # Groups: (1) the link target, (2) the optional label.  An optional
    # parenthesized argument list after the target is matched but not
    # captured.  DOTALL lets a label wrap over several lines.
    _LINK_RE = re.compile(r'{@link(?:plain)?\s+' + r'([\w#.]+)' +
                          r'(?:\([^\)]*\))?' + r'(\s+.*)?' + r'}',
                          re.DOTALL)

    # Inherit docs from ParsedDocstring.
    def to_html(self, docstring_linker, **options):
        # Split the docstring into an alternating list of HTML and
        # links (odd pieces are links).
        pieces = self._LINK_SPLIT_RE.split(self._docstring)

        # This function is used to translate {@link ...}s to HTML.
        translate_xref = docstring_linker.translate_identifier_xref

        # Collect the output fragments.  HTML pieces (even) are copied
        # through unchanged; link pieces (odd) are translated to HTML
        # by docstring_linker.
        html_parts = []
        for i, piece in enumerate(pieces):
            if i % 2 == 0:
                html_parts.append(piece)
                continue

            # Decompose the link into pieces.
            m = self._LINK_RE.match(piece)
            if m is None: continue # Error flagged by _check_links
            (target, name) = m.groups()

            # Normalize the target name: "#member" and "Class#member"
            # become dotted names.  (Any argument list was matched
            # outside the target group, so there is none to remove.)
            if target[0] == '#': target = target[1:]
            target = target.replace('#', '.')

            # Provide a name, if it wasn't specified (or is blank).
            name = (name or '').strip() or target

            # Use docstring_linker to convert the name to html.
            html_parts.append(translate_xref(target, name))
        return ''.join(html_parts)

    def _check_links(self, errors):
        """
        Make sure that all C{{@link}}s are valid.  We need a separate
        method for this because we want to do this at parse time, not
        html output time.  Any errors found are appended to C{errors}
        as non-fatal L{ParseError}s.

        @param errors: The list that bad-link errors are appended to.
        @type errors: C{list} of L{ParseError}
        """
        pieces = self._LINK_SPLIT_RE.split(self._docstring)
        # Number of newlines seen before the current piece.
        linenum = 0
        for i, piece in enumerate(pieces):
            # Odd pieces are the candidate links.
            if i % 2 == 1 and not self._LINK_RE.match(piece):
                estr = 'Bad link %r' % piece
                errors.append(ParseError(estr, linenum, is_fatal=0))
            linenum += piece.count('\n')

    #////////////////////////////////////////////////////////////
    # Plaintext Output.
    #////////////////////////////////////////////////////////////

    # Inherit docs from ParsedDocstring.  Since we don't define
    # to_latex, this is used when generating latex output.
    def to_plaintext(self, docstring_linker, **options):
        # The raw docstring is returned unchanged; docstring_linker
        # and options are not used.
        return self._docstring

    _SUMMARY_RE = re.compile(r'(\s*[\w\W]*?\.)(\s|$)')

    # Jeff's hack to get summary working.
    # Returns a (summary, other) pair: summary is a
    # ParsedJavadocDocstring holding the first sentence, and other is
    # true if there is more text after it.
    def summary(self):
        # Drop tags: ignore every line that starts with '@'.
        doc = "\n".join([ row for row in self._docstring.split('\n')
                          if not row.lstrip().startswith('@') ])

        m = self._SUMMARY_RE.match(doc)
        if m:
            # The first sentence is the summary; anything other than
            # whitespace after it counts as additional text.
            other = doc[m.end():]
            return (ParsedJavadocDocstring(m.group(1)),
                    other != '' and not other.isspace())

        else:
            # No sentence-ending period: use the first line, marking
            # it with '...' if more lines follow.
            parts = doc.strip('\n').split('\n', 1)
            if len(parts) == 1:
                summary = parts[0]
                other = False
            else:
                summary = parts[0] + '...'
                other = True

            return ParsedJavadocDocstring(summary), other
        
#     def concatenate(self, other):
#         if not isinstance(other, ParsedJavadocDocstring):
#             raise ValueError, 'Could not concatenate docstrings'
#         return ParsedJavadocDocstring(self._docstring+other._docstring)