¿Cómo obtener la fuente correspondiente a un nodo AST de Python?

Los nodos AST de Python tienen los atributos lineno y col_offset, que indican el comienzo del rango de código correspondiente. ¿Hay una manera fácil de obtener también el final del rango de código? ¿Quizá una biblioteca de terceros?

EDITAR: el último código (probado en Python 3.5-3.7) está aquí: https://bitbucket.org/plas/thonny/src/master/thonny/ast_utils.py

Como no encontré una manera fácil, aquí hay una manera difícil (y probablemente no óptima). Puede fallar y/o funcionar incorrectamente si el analizador de Python tiene más errores de lineno/col_offset que los mencionados (y resueltos) en el código. Probado en Python 3.3:

 def mark_code_ranges(node, source):
     """Add end positions to every positioned node of an AST.

     ``node`` is the root of an AST and ``source`` is the source text it was
     parsed from, as a string. Each node that has ``lineno`` and
     ``col_offset`` attributes receives ``end_lineno`` and ``end_col_offset``,
     taken from the end of the last token judged to belong to that node.

     The tree is modified in place and nothing is returned. As a side effect
     the start positions of string nodes, of ``Expr``/``Attribute`` nodes
     wrapping a string, and of ``BinOp`` nodes are rewritten to work around
     the parser bugs referenced in the helpers below.
     """
     NON_VALUE_KEYWORDS = set(keyword.kwlist) - {'False', 'True', 'None'}

     def _is_str_node(candidate):
         """Tell whether ``candidate`` is a string-literal node.

         Older parsers emit ``Str`` nodes, newer ones emit ``Constant`` nodes
         holding a ``str``. The legacy class is matched by name so that the
         deprecated ``ast.Str`` attribute is never touched.
         """
         if type(candidate).__name__ == "Str":
             return True
         constant_cls = getattr(ast, "Constant", None)
         return (constant_cls is not None
                 and isinstance(candidate, constant_cls)
                 and isinstance(candidate.value, str))

     def _get_ordered_child_nodes(node):
         """Return the child nodes of ``node`` in source order where known."""
         if isinstance(node, ast.Dict):
             # The AST stores all keys first and then all values; interleave
             # them so that they follow the order of the source text.
             children = []
             for key, value in zip(node.keys, node.values):
                 if key is not None:  # ``**mapping`` entries have no key node
                     children.append(key)
                 children.append(value)
             return children
         elif isinstance(node, ast.Call):
             children = [node.func] + node.args
             children.extend(kw.value for kw in node.keywords)
             # ``starargs``/``kwargs`` only exist on older ``ast.Call`` nodes.
             for legacy_field in ("starargs", "kwargs"):
                 extra = getattr(node, legacy_field, None)
                 if extra is not None:
                     children.append(extra)
             children.sort(key=lambda x: (x.lineno, x.col_offset))
             return children
         else:
             return ast.iter_child_nodes(node)

     def _fix_triple_quote_positions(root, all_tokens):
         """Re-anchor string nodes on their tokens.

         Works around http://bugs.python.org/issue18370
         """
         # NOTE(review): this pairs the n-th string node with the n-th STRING
         # token; implicit concatenation, bytes literals or f-strings would
         # break that pairing -- confirm before relying on it for such code.
         string_tokens = (tok for tok in all_tokens
                          if tok.type == token.STRING)

         def _fix_str_nodes(node):
             if _is_str_node(node):
                 tok = next(string_tokens, None)
                 if tok is not None:  # ran out of tokens: leave node as is
                     node.lineno, node.col_offset = tok.start

             for child in _get_ordered_child_nodes(node):
                 _fix_str_nodes(child)

         _fix_str_nodes(root)

         # fix their erroneous Expr parents
         for node in ast.walk(root):
             if (isinstance(node, (ast.Expr, ast.Attribute))
                     and _is_str_node(node.value)):
                 node.lineno = node.value.lineno
                 node.col_offset = node.value.col_offset

     def _fix_binop_positions(node):
         """Make every BinOp start where its left operand starts.

         Works around http://bugs.python.org/issue18374
         """
         # Children first, so that nested left operands are already fixed.
         for child in ast.iter_child_nodes(node):
             _fix_binop_positions(child)

         if isinstance(node, ast.BinOp):
             node.lineno = node.left.lineno
             node.col_offset = node.left.col_offset

     def _extract_tokens(tokens, lineno, col_offset,
                         end_lineno, end_col_offset):
         """Return the non-empty tokens lying fully inside the given range."""
         return [
             tok for tok in tokens
             if tok.start[0] >= lineno
             and (tok.start[1] >= col_offset or tok.start[0] > lineno)
             and tok.end[0] <= end_lineno
             and (tok.end[1] <= end_col_offset or tok.end[0] < end_lineno)
             and tok.string != ''
         ]

     def _mark_code_ranges_rec(node, tokens,
                               prelim_end_lineno, prelim_end_col_offset):
         """Mark ``node`` and its subtree.

         Returns the earliest starting position found in given tree,
         this is convenient for internal handling of the siblings.
         """
         # set end markers to this node
         if "lineno" in node._attributes and "col_offset" in node._attributes:
             tokens = _extract_tokens(tokens, node.lineno, node.col_offset,
                                      prelim_end_lineno, prelim_end_col_offset)
             # This call is what actually writes end_lineno/end_col_offset;
             # it also trims ``tokens`` in place for the benefit of children.
             tokens = _set_real_end(node, tokens,
                                    prelim_end_lineno, prelim_end_col_offset)

         # mark its children, starting from last one
         # NB! need to sort children because eg. in dict literal all keys
         # come first and then all values
         children = list(_get_ordered_child_nodes(node))
         for child in reversed(children):
             (prelim_end_lineno, prelim_end_col_offset) = \
                 _mark_code_ranges_rec(child, tokens,
                                       prelim_end_lineno,
                                       prelim_end_col_offset)

         if "lineno" in node._attributes and "col_offset" in node._attributes:
             # new "front" is beginning of this node
             prelim_end_lineno = node.lineno
             prelim_end_col_offset = node.col_offset

         return (prelim_end_lineno, prelim_end_col_offset)

     def _strip_trailing_junk_from_expressions(tokens):
         """Drop trailing tokens that cannot be the end of an expression."""
         while tokens and (
                 tokens[-1].type not in (token.RBRACE, token.RPAR, token.RSQB,
                                         token.NAME, token.NUMBER,
                                         token.STRING, token.ELLIPSIS)
                 and tokens[-1].string not in ")}]"
                 or tokens[-1].string in NON_VALUE_KEYWORDS):
             del tokens[-1]

     def _strip_trailing_extra_closers(tokens, remove_naked_comma):
         """Cut ``tokens`` at the first unbalanced closer (or naked comma)."""
         level = 0
         for i in range(len(tokens)):
             if tokens[i].string in "({[":
                 level += 1
             elif tokens[i].string in ")}]":
                 level -= 1

             if level == 0 and tokens[i].string == "," and remove_naked_comma:
                 tokens[:] = tokens[0:i]
                 return

             if level < 0:
                 tokens[:] = tokens[0:i]
                 return

     def _set_real_end(node, tokens,
                       prelim_end_lineno, prelim_end_col_offset):
         """Set the end markers of ``node`` and trim ``tokens`` in place.

         prelim_end_lineno and prelim_end_col_offset are the start of next
         positioned node or end of source, ie. the suffix of given range may
         contain keywords, commas and other stuff not belonging to current
         node. Returns the list of tokens which cover all its children.
         """
         if isinstance(node, ast.stmt):
             # remove empty trailing lines
             while tokens and (
                     tokens[-1].type in (tokenize.NL, tokenize.COMMENT,
                                         token.NEWLINE, token.INDENT)
                     or tokens[-1].string in (":", "else", "elif",
                                              "finally", "except")):
                 del tokens[-1]
         else:
             _strip_trailing_extra_closers(tokens,
                                           not isinstance(node, ast.Tuple))
             _strip_trailing_junk_from_expressions(tokens)

         if not tokens:
             # Nothing left to measure; leave this node's markers untouched
             # instead of failing with IndexError.
             return tokens

         # set the end markers of this node
         node.end_lineno = tokens[-1].end[0]
         node.end_col_offset = tokens[-1].end[1]

         # Try to peel off more tokens to give better estimate for children
         # Empty parens would confuse the children of no argument Call
         if (isinstance(node, ast.Call)
                 and not (node.args or node.keywords
                          or getattr(node, "starargs", None)
                          or getattr(node, "kwargs", None))):
             assert tokens[-1].string == ')'
             del tokens[-1]
             _strip_trailing_junk_from_expressions(tokens)
         # attribute name would confuse the "value" of Attribute
         elif isinstance(node, ast.Attribute):
             if tokens[-1].type == token.NAME:
                 del tokens[-1]
                 _strip_trailing_junk_from_expressions(tokens)
             else:
                 raise AssertionError("Expected token.NAME, got "
                                      + str(tokens[-1]))

         return tokens

     all_tokens = list(
         tokenize.tokenize(io.BytesIO(source.encode('utf-8')).readline))
     _fix_triple_quote_positions(node, all_tokens)
     _fix_binop_positions(node)

     # The preliminary end of the root is the very end of the source text.
     source_lines = source.split("\n")
     prelim_end_lineno = len(source_lines)
     prelim_end_col_offset = len(source_lines[-1])
     _mark_code_ranges_rec(node, all_tokens,
                           prelim_end_lineno, prelim_end_col_offset)

Teníamos una necesidad similar, y creé la biblioteca asttokens para este propósito. Mantiene la fuente tanto en forma de texto como en forma de token, y marca los nodos AST con información de token, desde donde el texto también está disponible.

Funciona con Python 2 y 3 (probado con 2.7 y 3.5). Por ejemplo:

 import ast

 import asttokens

 # Sample source to analyse. It is spelled with explicit "\n" escapes so the
 # character offsets printed below do not depend on how this snippet itself
 # is indented.
 st = (
     "\n"
     "def greet(a):\n"
     '  say("hello") if a else say("bye")\n'
 )

 # ASTTokens keeps the source text and, with parse=True, exposes the parsed
 # tree as ``atok.tree``.
 atok = asttokens.ASTTokens(st, parse=True)

 for node in ast.walk(atok.tree):
     # Only nodes that carry a position are reported.
     if hasattr(node, 'lineno'):
         # A single pre-formatted argument prints the same text whether
         # ``print`` is a statement (Python 2) or a function (Python 3).
         print("{} {} {}".format(
             atok.get_text_range(node),
             node.__class__.__name__,
             atok.get_text(node),
         ))

Imprime:

 (1, 50) FunctionDef def greet(a):
   say("hello") if a else say("bye")
 (17, 50) Expr say("hello") if a else say("bye")
 (11, 12) Name a
 (17, 50) IfExp say("hello") if a else say("bye")
 (33, 34) Name a
 (17, 29) Call say("hello")
 (40, 50) Call say("bye")
 (17, 20) Name say
 (21, 28) Str "hello"
 (40, 43) Name say
 (44, 49) Str "bye"

Hola, sé que es muy tarde, pero creo que esto es lo que está buscando, estoy haciendo el análisis solo para las definiciones de funciones en el módulo. Podemos obtener la primera y última línea del nodo ast mediante este método. De esta manera, las líneas de código fuente de una definición de función se pueden obtener analizando el archivo fuente leyendo solo las líneas que necesitamos. Este es un ejemplo muy simple,

 import ast


 def last_line(node):
     """Return the 1-based number of the last source line covered by ``node``.

     Uses the parser-provided ``end_lineno`` when the node has one. Otherwise
     falls back to the largest ``lineno`` found anywhere below ``node``, which
     also covers ``else`` branches, ``try`` blocks and other nested bodies.
     """
     end = getattr(node, "end_lineno", None)
     if end is not None:
         return end
     return max(getattr(child, "lineno", 0) for child in ast.walk(node))


 # Sample module: two functions separated by a blank line.
 st = (
     'def foo():\n'
     '    print("hello") \n'
     '\n'
     'def bla():\n'
     '    a = 1\n'
     '    b = 2\n'
     '    c= a+b\n'
     '    print(c)'
 )

 tree = ast.parse(st)
 # Split once, up front, so that ``st`` keeps being the source string.
 source_lines = st.split("\n")

 for function in tree.body:
     if isinstance(function, ast.FunctionDef):
         lastLine = last_line(function)
         print("Name of the function is ", function.name)
         print("firstLine of the function is ", function.lineno)
         print("LastLine of the function is ", lastLine)
         print("the source lines are ")
         # lineno is 1-based, list indexes are 0-based.
         for line in source_lines[function.lineno - 1:lastLine]:
             print(line)