diff --git a/.circleci/config.yml b/.circleci/config.yml
index c94adefc..c30ed3cf 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -170,25 +170,25 @@ workflows:
             branches:
               only:
                 - main
-  # build-py-deploy:
-  #   jobs:
-  #     - build-py-linux:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - build-py-macos:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - build-py-windows:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - store-and-upload-wheels:
-  #         filters:
-  #           branches:
-  #             only:
-  #         requires:
-  #           - build-py-windows
-  #           - build-py-linux
-  #           - build-py-macos
+  build-py-deploy:
+    jobs:
+      - build-py-linux:
+          filters:
+            branches:
+              only:
+      - build-py-macos:
+          filters:
+            branches:
+              only:
+      - build-py-windows:
+          filters:
+            branches:
+              only:
+      - store-and-upload-wheels:
+          filters:
+            branches:
+              only:
+          requires:
+            - build-py-windows
+            - build-py-linux
+            - build-py-macos
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 94f6df9a..bee3f3cd 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -155,24 +155,26 @@ class GPT4All():
             print("Model downloaded at: " + download_path)
         return download_path
 
-    def generate(self, prompt: str, **generate_kwargs) -> str:
+    def generate(self, prompt: str, streaming: bool = False, **generate_kwargs) -> str:
         """
         Surfaced method of running generate without accessing model object.
 
         Args:
             prompt: Raw string to be passed to model.
+            streaming: True if want output streamed to stdout.
             **generate_kwargs: Optional kwargs to pass to prompt context.
 
         Returns:
             Raw string of generated model response.
         """
-        return self.model.generate(prompt, **generate_kwargs)
+        return self.model.generate(prompt, streaming=streaming, **generate_kwargs)
 
     def chat_completion(self,
                         messages: List[Dict],
                         default_prompt_header: bool = True,
                         default_prompt_footer: bool = True,
                         verbose: bool = True,
+                        streaming: bool = True,
                         **generate_kwargs) -> str:
         """
         Format list of message dictionaries into a prompt and call model
@@ -189,6 +191,7 @@ class GPT4All():
                 before user/assistant role messages.
             default_prompt_footer: If True (default), add default footer at end of prompt.
             verbose: If True (default), print full prompt and generated response.
+            streaming: True if want output streamed to stdout.
             **generate_kwargs: Optional kwargs to pass to prompt context.
 
         Returns:
@@ -206,7 +209,7 @@ class GPT4All():
         if verbose:
             print(full_prompt)
 
-        response = self.model.generate(full_prompt, **generate_kwargs)
+        response = self.model.generate(full_prompt, streaming=streaming, **generate_kwargs)
 
         if verbose:
             print(response)
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/pyllmodel.py
index a1f29f4d..f7d32399 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/pyllmodel.py
@@ -1,25 +1,23 @@
-from io import StringIO
 import pkg_resources
 import ctypes
 import os
 import platform
 import re
+import subprocess
 import sys
 
-class DualOutput:
-    def __init__(self, stdout, string_io):
-        self.stdout = stdout
-        self.string_io = string_io
+class DualStreamProcessor:
+    def __init__(self, stream=None):
+        self.stream = stream
+        self.output = ""
 
     def write(self, text):
-        self.stdout.write(text)
-        self.string_io.write(text)
+        cleaned_text = re.sub(r"\n(?!\n)", "", text)
+        if self.stream is not None:
+            self.stream.write(cleaned_text)
+            self.stream.flush()
+        self.output += cleaned_text
 
-    def flush(self):
-        # It's a good idea to also define a flush method that flushes both
-        # outputs, as sys.stdout is expected to have this method.
-        self.stdout.flush()
-        self.string_io.flush()
 
 # TODO: provide a config file to make this more robust
 LLMODEL_PATH = os.path.join("llmodel_DO_NOT_MODIFY", "build").replace("\\", "\\\\")
@@ -175,7 +173,7 @@ class LLModel:
                  repeat_penalty: float = 1.2,
                  repeat_last_n: int = 10,
                  context_erase: float = .5,
-                 std_passthrough: bool = False) -> str:
+                 streaming: bool = False) -> str:
         """
         Generate response from model from a prompt.
 
@@ -183,12 +181,8 @@
         Parameters
         ----------
         prompt: str
            Question, task, or conversation for model to respond to
-        add_default_header: bool, optional
-            Whether to add a prompt header (default is True)
-        add_default_footer: bool, optional
-            Whether to add a prompt footer (default is True)
-        verbose: bool, optional
-            Whether to print prompt and response
+        streaming: bool
+            Stream response to stdout
 
         Returns
        -------
@@ -198,13 +192,14 @@
         prompt = prompt.encode('utf-8')
         prompt = ctypes.c_char_p(prompt)
 
-        # Change stdout to StringIO so we can collect response
        old_stdout = sys.stdout
-        collect_response = StringIO()
-        if std_passthrough:
-            sys.stdout = DualOutput(old_stdout, collect_response)
-        else:
-            sys.stdout = collect_response
+
+        stream_processor = DualStreamProcessor()
+
+        if streaming:
+            stream_processor.stream = sys.stdout
+
+        sys.stdout = stream_processor
 
         context = LLModelPromptContext(
             logits_size=logits_size,
@@ -227,14 +222,11 @@ class LLModel:
             ResponseCallback(self._response_callback),
             RecalculateCallback(self._recalculate_callback),
             context)
-
-        response = collect_response.getvalue()
+
+        # Revert to old stdout
         sys.stdout = old_stdout
 
-        # Remove the unnecessary new lines from response
-        response = re.sub(r"\n(?!\n)", "", response).strip()
-
-        return response
+        return stream_processor.output
 
     # Empty prompt callback
     @staticmethod
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index 25433efb..2d8e7614 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -78,6 +78,8 @@ setup(
         'dev': [
             'pytest',
             'twine',
+            'wheel',
+            'setuptools',
             'mkdocs-material',
             'mkautodoc',
             'mkdocstrings[python]',
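
For context on the pyllmodel.py change: the response callback writes generated tokens to stdout (which is why the old code swapped in a `StringIO`), so the prompt call temporarily replaces `sys.stdout` with a `DualStreamProcessor` that accumulates the response and, when `streaming` is enabled, echoes it to the real stdout as it arrives. The snippet below is a minimal, self-contained sketch of that pattern; `fake_model_callback` and the `generate` wrapper are hypothetical stand-ins for illustration, not code from the bindings.

```python
import re
import sys


class DualStreamProcessor:
    """Collects everything written to it; optionally tees it to a real stream."""

    def __init__(self, stream=None):
        self.stream = stream  # the real stdout when streaming, otherwise None
        self.output = ""      # accumulated response text

    def write(self, text):
        # Drop the single newlines emitted per token, but keep paragraph breaks ("\n\n").
        cleaned_text = re.sub(r"\n(?!\n)", "", text)
        if self.stream is not None:
            self.stream.write(cleaned_text)
            self.stream.flush()
        self.output += cleaned_text


def fake_model_callback():
    # Hypothetical stand-in for the llmodel response callback, which writes
    # each generated token to stdout as it is produced.
    for token in ["Hello", ",", " world", "!"]:
        print(token, end="")


def generate(streaming=False):
    old_stdout = sys.stdout
    processor = DualStreamProcessor(sys.stdout if streaming else None)
    sys.stdout = processor        # capture everything the callback prints
    try:
        fake_model_callback()
    finally:
        sys.stdout = old_stdout   # always restore the real stdout
    return processor.output


# Echoes the tokens as they arrive, then prints the collected result.
print(generate(streaming=True))
```

With the patch applied, the same behaviour is surfaced to users as `GPT4All.generate(prompt, streaming=True)` and the new `streaming` keyword on `chat_completion`; in both cases the full response string is still returned after generation finishes.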