Tags: python, python-2.7, decode, bittorrent, bencoding

How to decode bencoded torrent data


I'm trying to extract the size and name from a torrent file by decoding its contents with bencode.

I ran pip install bencode, then tested it with one line of a torrent file, as you can see here:

import bencode

# A single line copied from a .torrent file; the data stops right after the
# "pieces1120:" length prefix.
blabla = 'd8:announce70:http://tracker.t411.io:56969/c5faa6720249d33ff6ba2af48640af89/announce7:comment29:https://www.t411.io/t/524280210:created by19:https://www.t411.io13:creation datei1431685353e4:infod6:lengthi14634059e4:name22:Charlie-Hebdo-1178.pdf12:piece lengthi262144e6:pieces1120:'
# decode_string(x, f) decodes one string starting at offset f and returns a
# (value, next_offset) tuple; offset 1 skips the leading 'd'.
myprint = bencode.decode_string(blabla,1)
print myprint

This is the file that pip install put in the python lib:

from BTL import BTFailure


def decode_int(x, f):
    """Decode one bencoded integer (``i<digits>e``) whose 'i' is at x[f].

    Parameters:
        x: the complete bencoded data.
        f: offset of the leading 'i' marker.

    Returns:
        (value, next_offset): the integer and the offset just past the
        terminating 'e'.

    Raises:
        ValueError: there is no terminating 'e', the text between the
            markers is not an optional '-' followed by decimal digits, or
            the number has a leading zero / is a negative zero.
    """
    start = f + 1  # skip the 'i' marker
    end = x.index('e', start)
    text = x[start:end]
    digits = text[1:] if text[:1] == '-' else text
    # int() alone would also accept forms such as '+5' or ' 5'; only a
    # plain run of digits (optionally preceded by '-') is accepted here.
    if not digits.isdigit():
        raise ValueError("invalid bencoded integer: %r" % (text,))
    # Reject leading zeros ('05', '-05') and negative zero ('-0').
    if digits[0] == '0' and (len(digits) > 1 or digits != text):
        raise ValueError("invalid bencoded integer: %r" % (text,))
    return (int(text), end + 1)

def decode_string(x, f):
    """Decode one bencoded string (``<length>:<data>``) starting at x[f].

    Parameters:
        x: the complete bencoded data.
        f: offset of the first character of the length prefix.

    Returns:
        (value, next_offset): the decoded string and the offset just past
        it. If x ends before <length> characters are available, the value
        is whatever remains and next_offset points past the end of x; no
        error is raised for that here.

    Raises:
        ValueError: there is no ':' at or after f, the length prefix is
            not a plain run of decimal digits, or it has a leading zero.
    """
    colon = x.index(':', f)
    prefix = x[f:colon]
    # int() alone would also accept '-3', '+3' or ' 3'. A negative length
    # is the harmful case: the returned offset would move backwards.
    if not prefix.isdigit():
        raise ValueError("invalid string length prefix: %r" % (prefix,))
    if prefix[0] == '0' and len(prefix) > 1:
        raise ValueError("invalid string length prefix: %r" % (prefix,))
    n = int(prefix)
    start = colon + 1
    return (x[start:start + n], start + n)

def decode_list(x, f):
    """Decode one bencoded list (``l...e``) whose 'l' is at x[f].

    Each element is decoded by the function that decode_func maps its
    first character to.

    Returns:
        (items, next_offset): the decoded list and the offset just past
        the closing 'e'.
    """
    items = []
    pos = f + 1  # skip the 'l' marker
    while True:
        if x[pos] == 'e':
            return (items, pos + 1)
        item, pos = decode_func[x[pos]](x, pos)
        items.append(item)

def decode_dict(x, f):
    """Decode one bencoded dictionary (``d...e``) whose 'd' is at x[f].

    Keys are always decoded with decode_string; each value is decoded by
    the function that decode_func maps its first character to.

    Returns:
        (mapping, next_offset): the decoded dict and the offset just past
        the closing 'e'.
    """
    mapping = {}
    pos = f + 1  # skip the 'd' marker
    while True:
        if x[pos] == 'e':
            return (mapping, pos + 1)
        key, pos = decode_string(x, pos)
        value, pos = decode_func[x[pos]](x, pos)
        mapping[key] = value

# Dispatch table: the first character of an encoded value selects the
# function that decodes it.
decode_func = {
    'l': decode_list,
    'd': decode_dict,
    'i': decode_int,
}
# A string starts with its decimal length, so every digit maps to
# decode_string.
decode_func.update(dict.fromkeys('0123456789', decode_string))

def bdecode(x):
    """Decode a complete bencoded value and return the resulting object.

    The decoder is chosen from decode_func by the first character of x.

    Raises:
        BTFailure: x cannot be decoded (IndexError, KeyError or ValueError
            while decoding), or data remains after the decoded value.
    """
    try:
        decoder = decode_func[x[0]]
        value, end = decoder(x, 0)
    except (IndexError, KeyError, ValueError):
        raise BTFailure("not a valid bencoded string")
    # The decoded value must account for the whole input.
    if end == len(x):
        return value
    raise BTFailure("invalid bencoded value (data after valid prefix)")

from types import StringType, IntType, LongType, DictType, ListType, TupleType


class Bencached(object):
    """Holder for a value stored in the ``bencoded`` attribute.

    encode_bencached appends that attribute to the output unchanged, so
    the wrapped value is presumably expected to be already-bencoded data.
    """

    # Instances carry exactly one attribute.
    __slots__ = ['bencoded']

    def __init__(self, s):
        # s is stored as-is; nothing is validated here.
        self.bencoded = s

def encode_bencached(x,r):
    """Append the ``bencoded`` attribute of x to the output list r unchanged."""
    push = r.append
    push(x.bencoded)

def encode_int(x, r):
    """Append the pieces of the bencoded integer ``i<x>e`` to the list r."""
    r.extend(['i', str(x), 'e'])

def encode_bool(x, r):
    """Encode a truth value as the integer 1 (true) or 0 (false) into r."""
    encode_int(1 if x else 0, r)

def encode_string(x, r):
    """Append the pieces of the bencoded string ``<len(x)>:<x>`` to the list r."""
    r.extend([str(len(x)), ':', x])

def encode_list(x, r):
    """Append the bencoded list ``l...e`` for the sequence x to the list r.

    Each element is encoded by the function encode_func maps its exact
    type to; a type missing from that table raises KeyError.
    """
    r.append('l')
    for element in x:
        encoder = encode_func[type(element)]
        encoder(element, r)
    r.append('e')

def encode_dict(x,r):
    """Append the bencoded dictionary ``d...e`` for the mapping x to the list r.

    Entries are written in sorted order. Keys are written as
    ``<len>:<key>`` without going through encode_func; values are encoded
    by the function encode_func maps their exact type to.
    """
    r.append('d')
    for key, value in sorted(x.items()):
        r.extend((str(len(key)), ':', key))
        encoder = encode_func[type(value)]
        encoder(value, r)
    r.append('e')

# Dispatch table: the exact type of a value selects the function that
# encodes it.
encode_func = {
    Bencached: encode_bencached,
    IntType: encode_int,
    LongType: encode_int,
    StringType: encode_string,
    ListType: encode_list,
    TupleType: encode_list,
    DictType: encode_dict,
}

# Register the bool encoder only when the types module provides BooleanType.
try:
    from types import BooleanType
except ImportError:
    pass
else:
    encode_func[BooleanType] = encode_bool

def bencode(x):
    """Return the bencoded form of x as a single string.

    The encoder is looked up in encode_func by the exact type of x; a type
    missing from that table raises KeyError.
    """
    chunks = []
    encoder = encode_func[type(x)]
    encoder(x, chunks)
    return ''.join(chunks)

The thing is, I don't really understand how I can decode my line with this bencode module.

I already tried the bdecode function, but this is the output:

root@debian:/home/florian/Téléchargements# python decript.py 
Traceback (most recent call last):
  File "decript.py", line 4, in <module>
    myprint = bencode.bdecode(blabla)
  File "/usr/local/lib/python2.7/dist-packages/bencode/__init__.py", line 68, in bdecode
    raise BTFailure("not a valid bencoded string")
bencode.BTL.BTFailure: not a valid bencoded string

So I tried the decode_string function, but decode_string(blabla, 1) decodes only the first word:

root@debian:/home/florian/Téléchargements# python decript.py 
('announce', 11)

and other offsets such as 2, 3 or 4 don't work; they raise an error like:

root@debian:/home/florian/Téléchargements# python decript.py 
Traceback (most recent call last):
  File "decript.py", line 4, in <module>
    myprint = bencode.decode_string(blabla,10)
  File "/usr/local/lib/python2.7/dist-packages/bencode/__init__.py", line 29, in decode_string
    n = int(x[f:colon])
ValueError: invalid literal for int() with base 10: 'e70'

I want to decode the whole line, and I don't understand how I can do that with this bencode module.


Solution

  • You have an incomplete Bencoded string.

    The first part tells you there is a dictionary:

    d...
    

    which is supposed to be parsed until there is an e character. There is no such character in your input string.

    A manual parse shows you have the keys announce, comment, created by, creation date, and info, where the latter is a nested dictionary with length, name, piece length and pieces. Then your string stops; there is no value for pieces, and no e to mark the end of either the outer dictionary or the nested info dictionary. All we have for pieces is the length indicator: 1120.

    You could try and use the decoding functions directly, but then take into account that they return the value and the offset:

    >>> bencode.decode_string(blabla, 1)
    ('announce', 11)
    

    11 is the offset for the next value:

    >>> bencode.decode_string(blabla, 11)
    ('http://tracker.t411.io:56969/c5faa6720249d33ff6ba2af48640af89/announce', 84)
    

    and 84 is again the next:

    >>> bencode.decode_string(blabla, 84)
    ('comment', 93)
    

    If you take into account that the string is incomplete and that not all encoded objects are strings, you can still decode what little is there.

    The offset also tells you what function to use for decoding:

    >>> blabla[1]
    '8'
    >>> bencode.decode_func[blabla[1]]
    <function decode_string at 0x1004632a8>
    

    The number here spells out how many characters to expect. So skipping the failing d dictionary mapping you get:

    >>> offset = 1
    >>> while True:
    ...     value, offset = bencode.decode_func[blabla[offset]](blabla, offset)
    ...     print value
    ... 
    announce
    http://tracker.t411.io:56969/c5faa6720249d33ff6ba2af48640af89/announce
    comment
    https://www.t411.io/t/5242802
    created by
    https://www.t411.io
    creation date
    1431685353
    info
    Traceback (most recent call last):
      File "<stdin>", line 2, in <module>
      File "/Users/mj/Development/venvs/stackoverflow-2.7/lib/python2.7/site-packages/bencode/__init__.py", line 44, in decode_dict
        while x[f] != 'e':
    IndexError: string index out of range
    

    which fails because you hit the nested dictionary without e. You could extract those keys too, by adding one to the last offset:

    >>> offset
    194
    >>> blabla[offset]
    'd'
    >>> offset += 1
    >>> while True:
    ...     value, offset = bencode.decode_func[blabla[offset]](blabla, offset)
    ...     print value
    ... 
    length
    14634059
    name
    Charlie-Hebdo-1178.pdf
    piece length
    262144
    pieces
    
    Traceback (most recent call last):
      File "<stdin>", line 2, in <module>
    IndexError: string index out of range
    

    Or you could just read the data as binary data and not truncate it:

    # Read the complete file in binary mode ('rb') so the data is not cut short.
    with open(torrentfilename, 'rb') as torrentfile:
        torrent = bencode.bdecode(torrentfile.read())
    # torrent now holds the decoded dictionary.