I'd like to continue my last related thread in my attempt to understand and build a BitTorrent search engine. While listening the network for "get_peers" messages, I manage to grab infohashes. I proceed to ask the corresponding DHT node for it's peers. In my understanding in order to find out if the infohash is valid, (for starters) I have to send a BitTorrent handshake to the peers and compare the responses. However, besides the connection refused errors which I ignore for now, most peers reply with empty responses. Am I doing something wrong here? Note that the following code samples are not a great implementation, I just want to understand the flow.
Handshake function:
import socket
def handshake(infohash, peer):
peer_id = b"-TR2940-k8hj0wgej6ch"
handshake = b'\x13'
handshake += b'BitTorrent protocol'
handshake += b'\x00\x00\x00\x00\x00\x10\x00\x00'
handshake += infohash
handshake += peer_id
try:
ClientSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
ClientSocket.settimeout(3)
ClientSocket.connect(peer)
print("Connected to peer.")
ClientSocket.sendall(handshake)
response = ClientSocket.recv(68)
if not response:
print("Empty response.")
return
print(f"Handshake completed, resp: {response}")
ClientSocket.close()
except Exception as e:
print(e)
Utilities to get peers from given infohash and DHT node:
import random
import uuid
import bencode
import socket
from struct import unpack
import handshake
def newTID(tidlen):
tid = ""
for i in range(0, tidlen):
tid += chr(random.randint(97, 122))
return tid
def newID():
return uuid.uuid4().hex[0:20]
def split_nodes(nodes):
length = len(nodes)
if (length % 26) != 0:
return
for i in range(0, length, 26):
nid = nodes[i:i+20]
ip = socket.inet_ntoa(nodes[i+20:i+24])
port = unpack("!H", nodes[i+24:i+26])[0]
yield nid, ip, port
UDPClientSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
UDPClientSocket.settimeout(4)
def get_peers_from_infohash(infohash, node):
get_peers_query = {"t":"aa", "y":"q", "q":"get_peers", "a": {"id":newID(), "info_hash":infohash}}
get_peers_query = bencode.encode(get_peers_query)
UDPClientSocket.sendto(get_peers_query, node)
received = UDPClientSocket.recvfrom(65536)
msg = received[0]
decoded = bencode.decode(msg)
peers = split_nodes(decoded["r"]["nodes"])
for nid, ip, port in peers:
print(infohash, infohash.hex(), ip, port)
handshake.handshake(infohash, (ip, port))
My DHT crawler:
import bencode
import socket
import uuid
from struct import unpack
import threading
import random
import dhtutils
def newTID(tidlen):
tid = ""
for i in range(0, tidlen):
tid += chr(random.randint(97, 122))
return tid
def newID():
return uuid.uuid4().hex[0:20]
def handle_message(msg, node):
if msg.get("e"):
# print(msg.get("e"))
pass
elif msg.get("y") == "r":
handle_response(msg, node)
elif msg.get("y") == "q":
handle_query(msg, node)
def handle_query(msg, node):
try:
if msg["q"] == "get_peers":
infohash = msg["a"]["info_hash"]
# print(infohash.hex(), msg, node)
print(infohash.hex())
dhtutils.get_peers_from_infohash(infohash, node)
except:
pass
def handle_response(msg, node):
global all_nodes
if msg.get("r").get("nodes"):
# response from find_nodes
nodes = msg.get("r").get("nodes")
if nodes:
nodes = split_nodes(nodes)
for id, ip, port in nodes:
find_nodes(id, (ip, port))
all_nodes.append((id, (ip, port)))
elif msg.get("t") == "pg":
# response from ping
id = msg["r"]["id"]
all_nodes.append((id, node))
def split_nodes(nodes):
length = len(nodes)
if (length % 26) != 0:
return
for i in range(0, length, 26):
nid = nodes[i:i+20]
ip = socket.inet_ntoa(nodes[i+20:i+24])
port = unpack("!H", nodes[i+24:i+26])[0]
yield nid, ip, port
def find_nodes(id, node):
global UDPClientSocket
find_node_query = {"t":newTID(2), "y":"q", "q":"find_node", "a": {"id":newID(), "target":id}}
find_node_query = bencode.encode(find_node_query)
UDPClientSocket.sendto(find_node_query, node)
def ping(node):
global UDPClientSocket
ping_query = {"t":"pg", "y":"q", "q":"ping", "a":{"id":newID()}}
ping_query = bencode.encode(ping_query)
UDPClientSocket.sendto(ping_query, node)
def listen():
while True:
try:
received = UDPClientSocket.recvfrom(65536)
msg = received[0]
src = received[1]
decoded = bencode.decode(msg)
handle_message(decoded, src)
except Exception as e:
pass
UDPClientSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
T = threading.Thread(target=listen)
T.start()
nodes = [
("router.bittorrent.com", 6881),
("dht.transmissionbt.com", 6881),
("router.utorrent.com", 6881)
]
for node in nodes:
ping(node)
all_nodes = []
while True:
if len(all_nodes) > 0:
for node in all_nodes:
find_nodes(node[0], node[1])
Your DHT lookup is incorrect since you're looking at the nodes
response field which contains DHT nodes used for finding other DHT nodes when performing an iterative lookup in the DHT.
It is not the values
field that contains bittorrent contacts.
You're only going to get the latter when you have properly routed to the target region that covers an infohash in the DHT keyspace.