From 28e0958ff4e73cb0beab68fa31de628654162474 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Wed, 7 Aug 2024 10:04:58 -0400
Subject: [PATCH] core[patch]: Relax rate limit unit tests in terms of timing (#25140)

Relax rate limit unit tests
---
 .../chat_models/test_rate_limiting.py | 98 ++++++++++---------
 1 file changed, 51 insertions(+), 47 deletions(-)

diff --git a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
index 2d251aadb9..ac633b8263 100644
--- a/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
+++ b/libs/core/tests/unit_tests/language_models/chat_models/test_rate_limiting.py
@@ -9,9 +9,12 @@ def test_rate_limit_invoke() -> None:
     """Add rate limiter."""
 
     model = GenericFakeChatModel(
-        messages=iter(["hello", "world", "!"]),
+        messages=iter(["hello", "world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -19,22 +22,14 @@ def test_rate_limit_invoke() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
-
-    tic = time.time()
-    model.invoke("foo")
-    toc = time.time()
-    # The second time we call the model, we should have 1 extra token
-    # to proceed immediately.
-    assert toc - tic < 0.005
+    assert 0.10 < toc - tic < 0.15
 
-    # The third time we call the model, we need to wait again for a token
     tic = time.time()
     model.invoke("foo")
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    # Second time we check the model, we should have 1 extra token
+    # since the sleep time is 0.1 seconds
+    assert 0.00 < toc - tic < 0.10
 
 
 async def test_rate_limit_ainvoke() -> None:
@@ -43,7 +38,10 @@ async def test_rate_limit_ainvoke() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     tic = time.time()
@@ -58,7 +56,7 @@ async def test_rate_limit_ainvoke() -> None:
     toc = time.time()
     # The second time we call the model, we should have 1 extra token
     # to proceed immediately.
-    assert toc - tic < 0.01
+    assert toc - tic < 0.1
 
     # The third time we call the model, we need to wait again for a token
     tic = time.time()
@@ -74,17 +72,16 @@ def test_rate_limit_batch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     model.batch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 async def test_rate_limit_abatch() -> None:
@@ -92,17 +89,16 @@ async def test_rate_limit_abatch() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.01,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
-    # Need 2 tokens to proceed
-    time_to_fill = 2 / 200.0
     tic = time.time()
     await model.abatch(["foo", "foo"])
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    # with 0 tokens.
-    assert time_to_fill < toc - tic < time_to_fill + 0.03
+    assert 0.1 < toc - tic < 0.2
 
 
 def test_rate_limit_stream() -> None:
@@ -110,7 +106,10 @@ def test_rate_limit_stream() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=200, check_every_n_seconds=0.01, max_bucket_size=10
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
     )
     # Check astream
@@ -119,7 +118,7 @@ def test_rate_limit_stream() -> None:
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
+    assert 0.1 < toc - tic < 0.2
 
     # Second time around we should have 1 token left
     tic = time.time()
@@ -127,44 +126,46 @@ def test_rate_limit_stream() -> None:
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.005  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
 
     # Third time around we should have 0 tokens left
     tic = time.time()
     response = list(model.stream("foo"))
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
-    # Should be larger than check every n seconds since the token bucket starts
-    assert 0.01 < toc - tic < 0.02  # Slightly smaller than check every n seconds
+    assert 0.1 < toc - tic < 0.2
 
 
 async def test_rate_limit_astream() -> None:
     """Test rate limiting astream."""
-    rate_limiter = InMemoryRateLimiter(
-        requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=10
-    )
     model = GenericFakeChatModel(
         messages=iter(["hello world", "hello world", "hello world"]),
-        rate_limiter=rate_limiter,
+        rate_limiter=InMemoryRateLimiter(
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=10,
+            # At 20 requests per second we see a refresh every 0.05 seconds
+        ),
     )
     # Check astream
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
+    # Should be larger than check every n seconds since the token bucket starts
     assert 0.1 < toc - tic < 0.2
 
     # Second time around we should have 1 token left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
    assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
-    assert toc - tic < 0.01  # Slightly smaller than check every n seconds
+    assert toc - tic < 0.1  # Slightly smaller than check every n seconds
 
     # Third time around we should have 0 tokens left
     tic = time.time()
-    response = [chunk async for chunk in model.astream("foo")]
+    response = [msg async for msg in model.astream("foo")]
     assert [msg.content for msg in response] == ["hello", " ", "world"]
     toc = time.time()
     assert 0.1 < toc - tic < 0.2
@@ -176,7 +177,10 @@ def test_rate_limit_skips_cache() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20,
+            check_every_n_seconds=0.1,
+            max_bucket_size=1,
+            # At 20 requests per second we see a refresh every 0.05 seconds
         ),
         cache=cache,
     )
@@ -186,7 +190,7 @@ def test_rate_limit_skips_cache() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -195,7 +199,7 @@ def test_rate_limit_skips_cache() -> None:
         toc = time.time()
         # Should be larger than check every n seconds since the token bucket starts
         # with 0 tokens.
-        assert toc - tic < 0.005
+        assert toc - tic < 0.05
 
     # Test verifies that there's only a single key
     # Test also verifies that rate_limiter information is not part of the
@@ -236,7 +240,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     model = GenericFakeChatModel(
         messages=iter(["hello", "world", "!"]),
         rate_limiter=InMemoryRateLimiter(
-            requests_per_second=100, check_every_n_seconds=0.01, max_bucket_size=1
+            requests_per_second=20, check_every_n_seconds=0.1, max_bucket_size=1
         ),
         cache=cache,
     )
@@ -246,7 +250,7 @@ async def test_rate_limit_skips_cache_async() -> None:
     toc = time.time()
     # Should be larger than check every n seconds since the token bucket starts
     # with 0 tokens.
-    assert 0.01 < toc - tic < 0.02
+    assert 0.1 < toc - tic < 0.2
 
     for _ in range(2):
         # Cache hits
@@ -255,4 +259,4 @@ async def test_rate_limit_skips_cache_async() -> None:
         toc = time.time()
         # Should be larger than check every n seconds since the token bucket starts
         # with 0 tokens.
-        assert toc - tic < 0.005
+        assert toc - tic < 0.05
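
Note on the new timing bounds (an illustration, not part of the patch itself): the relaxed asserts follow from how InMemoryRateLimiter's token bucket behaves. The bucket starts with 0 tokens, refills at requests_per_second (roughly one token every 0.05 seconds at 20 rps, as the new comments state), and a blocked caller only re-checks the bucket every check_every_n_seconds. A minimal sketch of that behavior, assuming the public blocking acquire() method of langchain_core.rate_limiters.InMemoryRateLimiter:

    import time

    from langchain_core.rate_limiters import InMemoryRateLimiter

    limiter = InMemoryRateLimiter(
        requests_per_second=20,     # roughly one new token every 0.05 seconds
        check_every_n_seconds=0.1,  # blocked callers poll the bucket every 0.1 seconds
        max_bucket_size=10,         # cap on how many tokens can accumulate
    )

    tic = time.time()
    limiter.acquire()  # bucket starts empty, so this waits at least one 0.1 s poll
    print(f"first acquire: {time.time() - tic:.3f}s")   # expect roughly 0.10-0.15 s

    tic = time.time()
    limiter.acquire()  # a token has usually accumulated by now, so this returns quickly
    print(f"second acquire: {time.time() - tic:.3f}s")  # expect well under 0.10 s

This mirrors the 0.10 < toc - tic < 0.15 and 0.00 < toc - tic < 0.10 windows asserted in test_rate_limit_invoke above.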