dark-arts-of-tokenization/utils/combinatorics.py at main · rovle/dark-arts-of-tokenization

History

120 lines (97 loc) · 3.39 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

from functools import lru_cache

import random

import torch

from utils.tokenization_utils import tokenize

from typing import Iterable, List, Tuple

from collections import deque

@lru_cache(maxsize=None)

def partitions_of_n_with_up_to_k(n: int, k: int) -> list[list[int]]:

if n == 0:

return [[]]

if n < 0:

return []

ways = []

for num in range(1, k + 1):

sub_ways = partitions_of_n_with_up_to_k(n - num, k)

for way in sub_ways:

ways.append([num] + way)

return ways

@lru_cache(maxsize=None)

def _counts_1_k(n: int, k: int) -> List[int]:

"""

Return the list [C_0, C_1, ..., C_n] where C_r is the number of

ordered compositions of r into parts in {1, ..., k}.

Recurrence: C_0=1 and C_r = sum_{i=1..min(k,r)} C_{r-i}.

Uses an O(n) sliding-window implementation.

"""

if n < 0:

raise ValueError("n must be non-negative")

if k < 1:

raise ValueError("k must be at least 1")

C = [1] # C_0

window = deque([1]) # last up to k terms: starts with C_0

window_sum = 1

for r in range(1, n + 1):

c_r = window_sum

C.append(c_r)

window.append(c_r)

window_sum += c_r

if len(window) > k:

window_sum -= window.popleft()

return C

def sample_1_k_composition(n: int, k: int, rng=random.random) -> Tuple[int, ...]:

"""

Return a uniform random ordered composition of n into parts in {1, ..., k}.

"""

if n < 0:

raise ValueError("n must be non-negative")

if n == 0:

return tuple()

if k < 1:

raise ValueError("k must be at least 1")

C = _counts_1_k(n, k) # C[r] = number of compositions of r

r = n

out = []

while r > 0:

m = min(k, r)

# Draw i in {1..m} with probability C[r-i]/C[r]

threshold = rng() * C[r]

acc = 0

chosen = None

for i in range(1, m + 1):

acc += C[r - i]

if threshold < acc:

chosen = i

break

out.append(chosen)

r -= chosen

return tuple(out)

def tokenize_number_by_random_signature(number: int | str,

signatures_dict: dict[int, list[list[int]]] | None = None,

up_to_k_digits_token: int | None = 3 # 3 as llama "default"

) -> list[int]:

number = str(number)

if signatures_dict is None:

signature = sample_1_k_composition(len(number), k=up_to_k_digits_token)

else:

signature = random.choice(signatures_dict[len(number)])

assert len(number) == sum(signature), f"{number} -- {signature}"

tokenization = []

for i in signature:

tokenization.append(

tokenize(number[:i])[0]

)

number = number[i:]

return tokenization

def sample_k_pairs_of_numbers(acceptable_digits, k, weight_factor=None):

if weight_factor is None:

weight_factor = 1

weights = [weight_factor ** (d - min(acceptable_digits)) for d in acceptable_digits]

digit_counts = random.choices(acceptable_digits, weights=weights, k=2*k)

nums = []

for n in digit_counts:

low = 10 ** (n - 1)

high = 10 ** n - 1

nums.append(torch.randint(low=low, high=high+1, size=(1,)).item())

xs, ys = nums[:k], nums[k:]

return list(zip(xs, ys))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

combinatorics.py

Latest commit

History

combinatorics.py

File metadata and controls