"""
Exact-ish n-best search for Transformers LLMs
================================================
Implements the branch-and-bound algorithm.
It guarantees (with probability ≤ δ) that every length-k continuation whose
probability ranks in the true top-n set is returned.
Requires:
pip install transformers torch tqdm
This file can be imported:
from top_n_continuations import top_n_continuations
Or run directly:
python top_n_continuations.py
"""
from __future__ import annotations
import heapq
import math
from dataclasses import dataclass, field
from typing import List, Tuple
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# -----------------------------------------------------------------------------
# Helper dataclass to keep queue entries tidy
# -----------------------------------------------------------------------------
@dataclass(order=True)
class _PQItem:
    # NOTE: heapq is a min-heap, so we negate the bound for max-priority
    neg_upper_bound: float
    seq: Tuple[int, ...] = field(compare=False)
    logp: float = field(compare=False)
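# e.g. two nodes with upper bounds -0.2 and -1.3 are stored with keys 0.2 and
# 1.3; heapq pops the smallest key first, so the more promising node (the one
# bounded by -0.2) is expanded first.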
# -----------------------------------------------------------------------------
# Core algorithm
# -----------------------------------------------------------------------------
def top_n_continuations(
    prefix: str,
    model: "AutoModelForCausalLM",
    tokenizer: "AutoTokenizer",
    k: int = 5,
    n: int = 5,
    delta: float = 1e-6,
    p_star: float = 1.0,
    device: str | torch.device = "cuda",
    show_progress: bool = True,
) -> List[Tuple[str, float, Tuple[int, ...]]]:
"""Return *n* most-likely *k*-token continuations of *prefix*.
Args:
prefix: Prompt prefix (string).
model: Huggingface causal LM (*in eval mode*).
tokenizer: Matching tokenizer.
k: Desired continuation length (tokens).
n: Number of completions to return.
delta: Total miss-probability budget.
p_star: Upper bound for any single token probability. For a strict
guarantee, this should be 1.0. Using a smaller value
(e.g., 0.9) can speed up the search but makes the
guarantee heuristic.
device: CUDA / CPU device.
show_progress: Whether to show progress bars.
Returns:
List of (text, log_probability, token_sequence) tuples sorted best→worst.
"""
    model.eval()
    # Models loaded with device_map="auto" are already dispatched; moving them
    # again can fail, so only move models loaded without a device map.
    if getattr(model, "hf_device_map", None) is None:
        model.to(device)

    # Pre-tokenise the immutable prefix once
    prefix_ids: List[int] = tokenizer.encode(prefix, add_special_tokens=False)

    epsilon = delta / k  # per-step pruning budget
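    # Rationale: any single continuation is subject to at most k pruning
    # decisions (one per generated token), each discarding at most an epsilon
    # slice of conditional mass, so at most k * epsilon = delta of mass is
    # pruned along any one path (hence "exact-ish" in the module docstring).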
    # Priority queue for search states (max-heap via negative bounds)
    queue: List[_PQItem] = []
    heapq.heappush(queue, _PQItem(neg_upper_bound=0.0, seq=tuple(), logp=0.0))
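    # The root node is the empty continuation: log-probability 0.0 and a
    # trivially loose bound, so it is always the first node expanded.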
    # Min-heap storing the top n completed sequences found so far
    found: List[Tuple[float, Tuple[int, ...]]] = []

    # Progress tracking
    if show_progress:
        pbar = tqdm(total=n, desc="Finding completions", unit="completion")
    iterations = 0
    max_queue_size = 0
    while queue:
        iterations += 1
        max_queue_size = max(max_queue_size, len(queue))

        # Early exit: the queue is ordered by an optimistic (admissible) bound,
        # so once even the most promising open node cannot beat the worst of
        # the n completions already found, no unexplored node can either.
        if len(found) >= n:
            worst_logp_found = found[0][0]  # min-heap's root is the smallest
            best_upper_bound_in_queue = -queue[0].neg_upper_bound
            if best_upper_bound_in_queue < worst_logp_found:
                break

        item = heapq.heappop(queue)
        seq, logp_prefix = item.seq, item.logp

        if len(seq) == k:
            if len(found) < n:
                heapq.heappush(found, (logp_prefix, seq))
                if show_progress:
                    pbar.update(1)
            elif logp_prefix > found[0][0]:
                # A better completion evicts the current worst; the progress
                # bar total is already n, so it is not advanced here.
                heapq.heapreplace(found, (logp_prefix, seq))
            if show_progress:
                pbar.set_postfix({
                    'queue_size': len(queue),
                    'iterations': iterations,
                    'worst_found_logp': f"{found[0][0]:.3f}",
                })
            continue
        # -------------------------------------------------------------
        # Expand this node
        # -------------------------------------------------------------
        context_ids = prefix_ids + list(seq)
        context_tensor = torch.tensor([context_ids], device=device)
        with torch.no_grad():
            logits = model(context_tensor).logits[:, -1, :]  # (1, |V|)
        probs = torch.softmax(logits, dim=-1).squeeze(0)  # (|V|,)

        # Top-M pruning to keep ≥ 1 - epsilon mass
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        cumulative = torch.cumsum(sorted_probs, dim=0)
        M = int((cumulative < (1 - epsilon)).sum()) + 1
        top_indices = sorted_indices[:M]
        top_probs = sorted_probs[:M]
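        # The kept prefix has cumulative mass ≥ 1 - epsilon, so the discarded
        # tail carries at most epsilon mass: one step's worth of the delta budget.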
        for token_id, p_tok in zip(top_indices.tolist(), top_probs.tolist()):
            new_seq = seq + (token_id,)
            new_logp = logp_prefix + math.log(p_tok)
            remaining = k - len(new_seq)
            # Admissible bound: each remaining token contributes at most
            # log(p_star) to the log-probability, so no completion of new_seq
            # can score above this. With p_star = 1.0 it is simply new_logp.
            upper_bound = new_logp + remaining * math.log(p_star)
            heapq.heappush(queue, _PQItem(-upper_bound, new_seq, new_logp))

        # Update the progress bar every 10 iterations to avoid overhead
        if show_progress and iterations % 10 == 0:
            pbar.set_postfix({
                'queue_size': len(queue),
                'iterations': iterations,
                'depth': len(seq),
                'expansions': M,
            })
    if show_progress:
        pbar.close()
    print(f"Search completed: {iterations} iterations, max queue size: {max_queue_size}")
    # -----------------------------------------------------------------
    # Convert token sequences to strings and sort best→worst
    # -----------------------------------------------------------------
    def decode(seq_ids: Tuple[int, ...]) -> str:
        return tokenizer.decode(list(seq_ids), skip_special_tokens=True)

    # The 'found' min-heap is not fully sorted, so sort it now.
    found_sorted = sorted(found, key=lambda x: x[0], reverse=True)
    return [(decode(seq), logp, seq) for logp, seq in found_sorted]
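# -----------------------------------------------------------------------------
# Model loading helper
# -----------------------------------------------------------------------------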
def load_model_and_tokenizer(model_name: str, device: str = "cuda") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load model and tokenizer with progress indication."""
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        dtype=torch.bfloat16,  # older transformers versions want torch_dtype=
    )
    print("Model loaded successfully!")
    return model, tokenizer
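# -----------------------------------------------------------------------------
# Demo entry-point
# -----------------------------------------------------------------------------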
def main():
    """Example usage of the top_n_continuations function."""
    import textwrap

    # Configuration
    prefix = "You're interested in rationality and AI? You should visit"
    k = 6
    n = 16
    model_name = "meta-llama/Llama-3.2-3B"
    delta = 1e-6
    device = "cuda"

    # Load model
    model, tokenizer = load_model_and_tokenizer(model_name, device)

    # Find continuations
    print(f"\nFinding top {n} continuations of length {k} for:")
    print(f"Prefix: {prefix!r}\n")
    results = top_n_continuations(
        prefix, model, tokenizer, k=k, n=n, delta=delta, device=device
    )

    # Display results
    print("\nResults:")
    print("=" * 88)
    for i, (cont, logp, token_seq) in enumerate(results, 1):
        text = prefix + cont
        prob_percent = 100 * math.exp(logp)
        print(f"#{i}\tlog p = {logp:.4f} ({prob_percent:.4f}%)")
        print(f"Tokens: {token_seq}")
        print(textwrap.fill(text, width=88))
        print("-" * 88)
if __name__ == "__main__":
    main()