from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate(template=template, input_variables=["question"])

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

n_gpu_layers = 1  # Setting this to 1 is enough for Metal.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of RAM on your Apple Silicon chip.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="ggml-model-q4_0.bin",
    n_gpu_layers=n_gpu_layers,
    n_ctx=2048,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problems after a couple of calls
    callback_manager=callback_manager,
    verbose=True,
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
llm_chain.run(question)
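
# With streaming enabled, the answer is printed to stdout token by token as it
# is generated; llm_chain.run() also returns the full completion as a string.
# This assumes ggml-model-q4_0.bin exists at the given path and that
# llama-cpp-python was installed with Metal support enabled.

# A minimal sketch of calling the model directly, without an LLMChain (the
# prompt string below is just an illustration); tokens still stream to stdout
# via the callback handler, and the full completion is returned as a string:
result = llm("Name the planets in the solar system.")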