I am trying to programmatically ingest ("reflect") Google-style docstrings. I am using `sphinx.ext.napoleon`, as seemingly not many tools do this. I am following this example with the function below:
from sphinx.ext.napoleon import Config, GoogleDocstring
def foo(arg: int | None = 5) -> None:
"""Stub summary.
Args:
arg(int): Optional integer defaulted to 5.
"""
# Pass the raw ``__doc__`` (continuation lines still carry the source
# indentation) straight to napoleon and print the result.
docstring = GoogleDocstring(foo.__doc__)
print(docstring)
However, my usage doesn't automagically convert the printed output to reST style like the Sphinx example does.
So this leads me to my question: how can one programmatically ingest the summary, extended description, arg names, and arg descriptions from a Google-style docstring? Ideally they would be converted into some sort of data structure (e.g. a `dict` or a `dataclass`).
Instead, you can use the built-in `inspect` module to get the docstring, like this:
import inspect
# Fetch the docstring via inspect.getdoc() instead of ``foo.__doc__``;
# presumably its indentation clean-up is what lets napoleon convert the
# ``Args:`` section.
docstring = GoogleDocstring(inspect.getdoc(foo))
print(docstring)
This prints the following:
Stub summary.
:param arg: Optional integer defaulted to 5.
:type arg: int
The difference between `inspect.getdoc(foo)` and `foo.__doc__` seems to be the indentation:
print(foo.__doc__)
Stub summary.
    Args:
        arg(int): Optional integer defaulted to 5.
print(inspect.getdoc(foo))
Stub summary.
Args:
arg(int): Optional integer defaulted to 5.
To use the `__doc__` attribute, you can apply the `prepare_docstring` function, like this:
from sphinx.util.docstrings import prepare_docstring
# Run the raw ``__doc__`` through Sphinx's prepare_docstring() before handing
# it to napoleon (presumably this normalises the indentation, as
# inspect.getdoc() does).
docstring = GoogleDocstring(prepare_docstring(foo.__doc__))
print(docstring)
Then, you can either write your own parser or use third-party libraries such as doctrans or docstring_parser. For the sake of example and simplicity, I've taken the solution below from the doctrans source. Since it supports more than is required here, and I didn't want to install it and pollute the system, I've used the code directly:
import re
import sys
# Matches the start of a reST ``:param`` / ``:return`` / ``:returns`` field.
PARAM_OR_RETURNS_REGEX = re.compile(":(?:param|returns?)")
# Captures everything after ``:return: `` / ``:returns: ``; DOTALL lets the
# capture span multiple lines.
RETURNS_REGEX = re.compile(":returns?: (?P<doc>.*)", re.DOTALL)
# Captures each ``:param <name>: `` field and its text, stopping lazily at the
# next ``:param`` / ``:return`` / ``:raises`` field or at the end of the input.
PARAM_REGEX = re.compile(
r":param (?P<name>[\*\w]+): (?P<doc>.*?)"
r"(?:(?=:param)|(?=:return)|(?=:raises)|\Z)",
re.DOTALL,
)
def trim(docstring):
"""Trim function from PEP-257."""
if not docstring:
return ""
# Convert tabs to spaces (following the normal Python rules)
# and split into a list of lines:
lines = docstring.expandtabs().splitlines()
# Determine minimum indentation (first line doesn't count):
indent = sys.maxsize
for line in lines[1:]:
stripped = line.lstrip()
if stripped:
indent = min(indent, len(line) - len(stripped))
# Remove indentation (first line is special):
trimmed = [lines[0].strip()]
if indent < sys.maxsize:
for line in lines[1:]:
trimmed.append(line[indent:].rstrip())
# Strip off trailing and leading blank lines:
while trimmed and not trimmed[-1]:
trimmed.pop()
while trimmed and not trimmed[0]:
trimmed.pop(0)
# Current code/unittests expects a line return at
# end of multiline docstrings
# workaround expected behavior from unittests
if "\n" in docstring:
trimmed.append("")
# Return a single string:
return "\n".join(trimmed)
def reindent(string: str) -> str:
    """Remove all leading and trailing whitespace from every line.

    The text as a whole is stripped first, so leading and trailing blank
    lines disappear; blank lines in the middle are kept as empty lines.

    :param string: Possibly indented multi-line text.
    :returns: The lines, each stripped, joined with ``"\\n"``.
    """
    return "\n".join(line.strip() for line in string.strip().split("\n"))
def doc_to_type_doc(name, doc):
    """Split one parameter's captured text into description and type.

    Lines before a ``:type <name>: <type>`` line form the description. The
    text after that field, and every following line, forms the type. A type
    wrapped in triple backticks has the backticks removed.

    :param name: Parameter name the ``:type`` field is expected to refer to.
    :param doc: Text captured after ``:param <name>: ``; it may contain a
        ``:type`` line.
    :returns: ``{"doc": ...}``, plus a ``"typ"`` key only when a ``:type``
        line was found.
    :raises AssertionError: If the ``:type`` line has no closing colon or
        names a different parameter.
    """
    docs, typ = [], []
    for line in trim(doc).splitlines():
        if line.startswith(":type"):
            found_name, colon, rest = line[len(":type ") :].partition(":")
            # Raised explicitly (not via ``assert``) so the check still runs
            # under ``python -O``; the exception type is kept for callers.
            if not colon or found_name != name:
                raise AssertionError(f"{name!r} != {found_name!r}")
            # Skip the single separator character that follows the colon.
            type_text = rest[1:]
            if type_text.startswith("```") and type_text.endswith("```"):
                type_text = type_text[3:-3]
            typ.append(type_text)
        elif typ:
            # Once the type has started, later lines continue it.
            typ.append(line)
        else:
            docs.append(line)

    result = {"doc": "\n".join(docs)}
    if typ:
        result["typ"] = "\n".join(typ)
    return result
def _split_returns(returns):
    """Split reindented ``:returns:`` text at its ``:rtype:`` marker, if any."""
    description, marker, typ = returns.partition(":rtype:")
    parsed = {"name": description.rstrip()}
    if marker:
        typ = typ.strip()
        if typ.startswith("```") and typ.endswith("```"):
            typ = typ[3:-3]
        parsed["typ"] = typ
    return parsed


def parse_docstring(docstring):
    """Parse a reST-style docstring into its components.

    The first line is the short description. The text up to the first
    ``:param`` / ``:return(s)`` field is the long description. The rest is
    parsed into parameters and the return entry.

    :param docstring: Docstring text using ``:param`` / ``:type`` /
        ``:returns`` / ``:rtype`` fields; ``None`` or ``""`` is accepted.
    :returns: a dictionary of form
        {
            'short_description': ...,
            'long_description': ...,
            'params': [{'name': ..., 'doc': ..., 'typ': ...}, ...],
            'returns': {'name': ..., 'typ': ...}
        }
        ``'typ'`` keys are present only when a type was given, and
        ``'returns'`` is ``""`` when there is no returns field.
    """
    short_description = long_description = returns = ""
    params = []

    if docstring:
        summary, _, remainder = trim(docstring.lstrip("\n")).partition("\n")
        short_description = summary
        long_description = remainder.strip()

        fields = None
        match = PARAM_OR_RETURNS_REGEX.search(long_description)
        if match:
            # Everything from the first field onwards is field text, not prose.
            fields = long_description[match.start() :].strip()
            long_description = long_description[: match.start()].rstrip()

        if fields:
            params = [
                dict(name=name, **doc_to_type_doc(name, doc))
                for name, doc in PARAM_REGEX.findall(fields)
            ]

            match = RETURNS_REGEX.search(fields)
            if match:
                returns = reindent(match.group("doc"))
            if returns:
                # Split on the literal ``:rtype:`` marker rather than on the
                # first colon, so a colon inside the description is kept.
                returns = _split_returns(returns)

    return {
        "short_description": short_description,
        "long_description": long_description,
        "params": params,
        "returns": returns,
    }
# Re-join the lines of the converted GoogleDocstring and parse them into a dict.
# NOTE(review): the return value is discarded here; assign or print it to see it.
parse_docstring("\n".join(docstring.lines()))