|
|
|
@ -9,9 +9,12 @@ def test_rate_limit_invoke() -> None:
|
|
|
|
|
"""Add rate limiter."""
|
|
|
|
|
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
messages=iter(["hello", "world"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.1,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
tic = time.time()
|
|
|
|
@ -19,22 +22,14 @@ def test_rate_limit_invoke() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert 0.01 < toc - tic < 0.02
|
|
|
|
|
|
|
|
|
|
tic = time.time()
|
|
|
|
|
model.invoke("foo")
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# The second time we call the model, we should have 1 extra token
|
|
|
|
|
# to proceed immediately.
|
|
|
|
|
assert toc - tic < 0.005
|
|
|
|
|
assert 0.10 < toc - tic < 0.15
|
|
|
|
|
|
|
|
|
|
# The third time we call the model, we need to wait again for a token
|
|
|
|
|
tic = time.time()
|
|
|
|
|
model.invoke("foo")
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert 0.01 < toc - tic < 0.02
|
|
|
|
|
# Second time we check the model, we should have 1 extra token
|
|
|
|
|
# since the sleep time is 0.1 seconds
|
|
|
|
|
assert 0.00 < toc - tic < 0.10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def test_rate_limit_ainvoke() -> None:
|
|
|
|
@ -43,7 +38,10 @@ async def test_rate_limit_ainvoke() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.1,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
tic = time.time()
|
|
|
|
@ -58,7 +56,7 @@ async def test_rate_limit_ainvoke() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# The second time we call the model, we should have 1 extra token
|
|
|
|
|
# to proceed immediately.
|
|
|
|
|
assert toc - tic < 0.01
|
|
|
|
|
assert toc - tic < 0.1
|
|
|
|
|
|
|
|
|
|
# The third time we call the model, we need to wait again for a token
|
|
|
|
|
tic = time.time()
|
|
|
|
@ -74,17 +72,16 @@ def test_rate_limit_batch() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.01,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
# Need 2 tokens to proceed
|
|
|
|
|
time_to_fill = 2 / 200.0
|
|
|
|
|
tic = time.time()
|
|
|
|
|
model.batch(["foo", "foo"])
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert time_to_fill < toc - tic < time_to_fill + 0.03
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def test_rate_limit_abatch() -> None:
|
|
|
|
@ -92,17 +89,16 @@ async def test_rate_limit_abatch() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.01,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
# Need 2 tokens to proceed
|
|
|
|
|
time_to_fill = 2 / 200.0
|
|
|
|
|
tic = time.time()
|
|
|
|
|
await model.abatch(["foo", "foo"])
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert time_to_fill < toc - tic < time_to_fill + 0.03
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_rate_limit_stream() -> None:
|
|
|
|
@ -110,7 +106,10 @@ def test_rate_limit_stream() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello world", "hello world", "hello world"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.1,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
# Check astream
|
|
|
|
@ -119,7 +118,7 @@ def test_rate_limit_stream() -> None:
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
assert 0.01 < toc - tic < 0.02 # Slightly smaller than check every n seconds
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
# Second time around we should have 1 token left
|
|
|
|
|
tic = time.time()
|
|
|
|
@ -127,44 +126,46 @@ def test_rate_limit_stream() -> None:
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
assert toc - tic < 0.005 # Slightly smaller than check every n seconds
|
|
|
|
|
assert toc - tic < 0.1 # Slightly smaller than check every n seconds
|
|
|
|
|
|
|
|
|
|
# Third time around we should have 0 tokens left
|
|
|
|
|
tic = time.time()
|
|
|
|
|
response = list(model.stream("foo"))
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
assert 0.01 < toc - tic < 0.02 # Slightly smaller than check every n seconds
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def test_rate_limit_astream() -> None:
|
|
|
|
|
"""Test rate limiting astream."""
|
|
|
|
|
rate_limiter = InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
|
|
|
|
|
)
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello world", "hello world", "hello world"]),
|
|
|
|
|
rate_limiter=rate_limiter,
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.1,
|
|
|
|
|
max_bucket_size=10,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
# Check astream
|
|
|
|
|
tic = time.time()
|
|
|
|
|
response = [chunk async for chunk in model.astream("foo")]
|
|
|
|
|
response = [msg async for msg in model.astream("foo")]
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
# Second time around we should have 1 token left
|
|
|
|
|
tic = time.time()
|
|
|
|
|
response = [chunk async for chunk in model.astream("foo")]
|
|
|
|
|
response = [msg async for msg in model.astream("foo")]
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
assert toc - tic < 0.01 # Slightly smaller than check every n seconds
|
|
|
|
|
assert toc - tic < 0.1 # Slightly smaller than check every n seconds
|
|
|
|
|
|
|
|
|
|
# Third time around we should have 0 tokens left
|
|
|
|
|
tic = time.time()
|
|
|
|
|
response = [chunk async for chunk in model.astream("foo")]
|
|
|
|
|
response = [msg async for msg in model.astream("foo")]
|
|
|
|
|
assert [msg.content for msg in response] == ["hello", " ", "world"]
|
|
|
|
|
toc = time.time()
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
@ -176,7 +177,10 @@ def test_rate_limit_skips_cache() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
|
|
|
|
|
requests_per_second=20,
|
|
|
|
|
check_every_n_seconds=0.1,
|
|
|
|
|
max_bucket_size=1,
|
|
|
|
|
# At 20 requests per second we see a refresh every 0.05 seconds
|
|
|
|
|
),
|
|
|
|
|
cache=cache,
|
|
|
|
|
)
|
|
|
|
@ -186,7 +190,7 @@ def test_rate_limit_skips_cache() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert 0.01 < toc - tic < 0.02
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
for _ in range(2):
|
|
|
|
|
# Cache hits
|
|
|
|
@ -195,7 +199,7 @@ def test_rate_limit_skips_cache() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert toc - tic < 0.005
|
|
|
|
|
assert toc - tic < 0.05
|
|
|
|
|
|
|
|
|
|
# Test verifies that there's only a single key
|
|
|
|
|
# Test also verifies that rate_limiter information is not part of the
|
|
|
|
@ -236,7 +240,7 @@ async def test_rate_limit_skips_cache_async() -> None:
|
|
|
|
|
model = GenericFakeChatModel(
|
|
|
|
|
messages=iter(["hello", "world", "!"]),
|
|
|
|
|
rate_limiter=InMemoryRateLimiter(
|
|
|
|
|
requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
|
|
|
|
|
requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=1
|
|
|
|
|
),
|
|
|
|
|
cache=cache,
|
|
|
|
|
)
|
|
|
|
@ -246,7 +250,7 @@ async def test_rate_limit_skips_cache_async() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert 0.01 < toc - tic < 0.02
|
|
|
|
|
assert 0.1 < toc - tic < 0.2
|
|
|
|
|
|
|
|
|
|
for _ in range(2):
|
|
|
|
|
# Cache hits
|
|
|
|
@ -255,4 +259,4 @@ async def test_rate_limit_skips_cache_async() -> None:
|
|
|
|
|
toc = time.time()
|
|
|
|
|
# Should be larger than check every n seconds since the token bucket starts
|
|
|
|
|
# with 0 tokens.
|
|
|
|
|
assert toc - tic < 0.005
|
|
|
|
|
assert toc - tic < 0.05
|
|
|
|
|