@@ -65,7 +65,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
 handler.setLevel(logging.INFO)
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 handler.setFormatter(formatter)
 logger.addHandler(handler)
@@ -100,10 +100,16 @@ def load_pipes(local_deployment):
     if local_deployment in ["full"]:
         other_pipes = {
             "nlpconnect/vit-gpt2-image-captioning": {
-                "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "device": device
+                "model": VisionEncoderDecoderModel.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "feature_extractor": ViTImageProcessor.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "tokenizer": AutoTokenizer.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "device": device,
             },
             # "Salesforce/blip-image-captioning-large": {
             #     "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
@@ -111,8 +117,12 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "damo-vilab/text-to-video-ms-1.7b": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/damo-vilab/text-to-video-ms-1.7b",
+                    torch_dtype=torch.float16,
+                    variant="fp16",
+                ),
+                "device": device,
             },
             # "facebook/maskformer-swin-large-ade": {
             #     "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
@@ -130,16 +140,22 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
-                "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
-                "device": device
+                "model": BaseModel.from_pretrained(
+                    "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"
+                ),
+                "device": device,
             },
             "espnet/kan-bayashi_ljspeech_vits": {
-                "model": Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits"),
-                "device": device
+                "model": Text2Speech.from_pretrained(
+                    "espnet/kan-bayashi_ljspeech_vits"
+                ),
+                "device": device,
             },
             "lambdalabs/sd-image-variations-diffusers": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), # torch_dtype=torch.float16
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/lambdalabs/sd-image-variations-diffusers"
+                ),  # torch_dtype=torch.float16
+                "device": device,
             },
             # "CompVis/stable-diffusion-v1-4": {
             #     "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
@@ -150,8 +166,10 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "runwayml/stable-diffusion-v1-5": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/runwayml/stable-diffusion-v1-5"
+                ),
+                "device": device,
             },
             # "microsoft/speecht5_tts":{
             #     "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
@@ -165,11 +183,19 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "microsoft/speecht5_vc": {
-                "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
-                "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
-                "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
-                "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
-                "device": device
+                "processor": SpeechT5Processor.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_vc"
+                ),
+                "model": SpeechT5ForSpeechToSpeech.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_vc"
+                ),
+                "vocoder": SpeechT5HifiGan.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_hifigan"
+                ),
+                "embeddings_dataset": load_dataset(
+                    f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"
+                ),
+                "device": device,
             },
             # "julien-c/wine-quality": {
             #     "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
@@ -180,15 +206,23 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "facebook/maskformer-swin-base-coco": {
-                "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
-                "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
-                "device": device
+                "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(
+                    f"{local_fold}/facebook/maskformer-swin-base-coco"
+                ),
+                "model": MaskFormerForInstanceSegmentation.from_pretrained(
+                    f"{local_fold}/facebook/maskformer-swin-base-coco"
+                ),
+                "device": device,
             },
             "Intel/dpt-hybrid-midas": {
-                "model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
-                "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
-                "device": device
-            }
+                "model": DPTForDepthEstimation.from_pretrained(
+                    f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True
+                ),
+                "feature_extractor": DPTFeatureExtractor.from_pretrained(
+                    f"{local_fold}/Intel/dpt-hybrid-midas"
+                ),
+                "device": device,
+            },
         }

     if local_deployment in ["full", "standard"]:
@@ -198,36 +232,53 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "openai/whisper-base": {
-                "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
-                "device": device
+                "model": pipeline(
+                    task="automatic-speech-recognition",
+                    model=f"{local_fold}/openai/whisper-base",
+                ),
+                "device": device,
             },
             "microsoft/speecht5_asr": {
-                "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
-                "device": device
+                "model": pipeline(
+                    task="automatic-speech-recognition",
+                    model=f"{local_fold}/microsoft/speecht5_asr",
+                ),
+                "device": device,
             },
             "Intel/dpt-large": {
-                "model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
-                "device": device
+                "model": pipeline(
+                    task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"
+                ),
+                "device": device,
             },
             # "microsoft/beit-base-patch16-224-pt22k-ft22k": {
             #     "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
             #     "device": device
             # },
             "facebook/detr-resnet-50-panoptic": {
-                "model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
-                "device": device
+                "model": pipeline(
+                    task="image-segmentation",
+                    model=f"{local_fold}/facebook/detr-resnet-50-panoptic",
+                ),
+                "device": device,
             },
             "facebook/detr-resnet-101": {
-                "model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
-                "device": device
+                "model": pipeline(
+                    task="object-detection",
+                    model=f"{local_fold}/facebook/detr-resnet-101",
+                ),
+                "device": device,
             },
             # "openai/clip-vit-large-patch14": {
             #     "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
             #     "device": device
             # },
             "google/owlvit-base-patch32": {
-                "model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
-                "device": device
+                "model": pipeline(
+                    task="zero-shot-object-detection",
+                    model=f"{local_fold}/google/owlvit-base-patch32",
+                ),
+                "device": device,
             },
             # "microsoft/DialoGPT-medium": {
             #     "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
@@ -270,86 +321,121 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "impira/layoutlm-document-qa": {
-                "model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
-                "device": device
+                "model": pipeline(
+                    task="document-question-answering",
+                    model=f"{local_fold}/impira/layoutlm-document-qa",
+                ),
+                "device": device,
             },
             "ydshieh/vit-gpt2-coco-en": {
-                "model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
-                "device": device
+                "model": pipeline(
+                    task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"
+                ),
+                "device": device,
             },
             "dandelin/vilt-b32-finetuned-vqa": {
-                "model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
-                "device": device
-            }
+                "model": pipeline(
+                    task="visual-question-answering",
+                    model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa",
+                ),
+                "device": device,
+            },
         }

     if local_deployment in ["full", "standard", "minimal"]:
-        controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        controlnet = ControlNetModel.from_pretrained(
+            f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
+        )
         controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
-            f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+            f"{local_fold}/runwayml/stable-diffusion-v1-5",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
         )

         def mlsd_control_network():
             model = MobileV2_MLSD_Large()
-            model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
+            model.load_state_dict(
+                torch.load(
+                    f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"
+                ),
+                strict=True,
+            )
             return MLSDdetector(model)

-        hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
+        hed_network = Network(
+            f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth"
+        )

         controlnet_sd_pipes = {
             "openpose-control": {
-                "model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
-            },
-            "mlsd-control": {
-                "model": mlsd_control_network()
-            },
-            "hed-control": {
-                "model": HEDdetector(hed_network)
-            },
-            "scribble-control": {
-                "model": HEDdetector(hed_network)
-            },
+                "model": OpenposeDetector(
+                    Body(
+                        f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"
+                    )
+                )
+            },
+            "mlsd-control": {"model": mlsd_control_network()},
+            "hed-control": {"model": HEDdetector(hed_network)},
+            "scribble-control": {"model": HEDdetector(hed_network)},
             "midas-control": {
-                "model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
-            },
-            "canny-control": {
-                "model": CannyDetector()
-            },
+                "model": MidasDetector(
+                    model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
+                )
+            },
+            "canny-control": {"model": CannyDetector()},
             "lllyasviel/sd-controlnet-canny": {
                 "control": controlnet,
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-depth": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-depth",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-hed": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-hed",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-mlsd": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-mlsd",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-openpose": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-openpose",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-scribble": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-scribble",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-seg": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-seg",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
-            }
+                "device": device,
+            },
         }
     pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
     return pipes
@@ -363,14 +449,17 @@ during = end - start
 print(f"[ ready ] {during}s")


-@app.route('/running', methods=['GET'])
+@app.route("/running", methods=["GET"])
 def running():
     return jsonify({"running": True})


-@app.route('/status/<path:model_id>', methods=['GET'])
+@app.route("/status/<path:model_id>", methods=["GET"])
 def status(model_id):
-    disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
+    disabled_models = [
+        "microsoft/trocr-base-printed",
+        "microsoft/trocr-base-handwritten",
+    ]
     if model_id in pipes.keys() and model_id not in disabled_models:
         print(f"[ check {model_id} ] success")
         return jsonify({"loaded": True})
@@ -379,7 +468,7 @@ def status(model_id):
         return jsonify({"loaded": False})


-@app.route('/models/<path:model_id>', methods=['POST'])
+@app.route("/models/<path:model_id>", methods=["POST"])
 def models(model_id):
     while "using" in pipes[model_id] and pipes[model_id]["using"]:
         print(f"[ inference {model_id} ] waiting")
@@ -402,23 +491,29 @@ def models(model_id):
     try:
         # text to video
         if model_id == "damo-vilab/text-to-video-ms-1.7b":
-            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+            pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+                pipe.scheduler.config
+            )
             # pipe.enable_model_cpu_offload()
             prompt = request.get_json()["text"]
             video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
             video_path = export_to_video(video_frames)
             file_name = str(uuid.uuid4())[:4]
-            os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
+            os.system(
+                f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4"
+            )
             result = {"path": f"/videos/{file_name}.mp4"}

         # controlnet
         if model_id.startswith("lllyasviel/sd-controlnet-"):
-            pipe.controlnet.to('cpu')
+            pipe.controlnet.to("cpu")
             pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
             pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
             control_image = load_image(request.get_json()["img_url"])
             # generator = torch.manual_seed(66)
-            out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
+            out_image: Image = pipe(
+                request.get_json()["text"], num_inference_steps=20, image=control_image
+            ).images[0]
             file_name = str(uuid.uuid4())[:4]
             out_image.save(f"public/images/{file_name}.png")
             result = {"path": f"/images/{file_name}.png"}
@@ -441,17 +536,20 @@ def models(model_id):
             file_name = str(uuid.uuid4())[:4]
             with open(f"public/images/{file_name}.png", "wb") as f:
                 f.write(request.data)
-            tform = transforms.Compose([
-                transforms.ToTensor(),
-                transforms.Resize(
-                    (224, 224),
-                    interpolation=transforms.InterpolationMode.BICUBIC,
-                    antialias=False,
-                    ),
-                transforms.Normalize(
-                    [0.48145466, 0.4578275, 0.40821073],
-                    [0.26862954, 0.26130258, 0.27577711]),
-            ])
+            tform = transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Resize(
+                        (224, 224),
+                        interpolation=transforms.InterpolationMode.BICUBIC,
+                        antialias=False,
+                    ),
+                    transforms.Normalize(
+                        [0.48145466, 0.4578275, 0.40821073],
+                        [0.26862954, 0.26130258, 0.27577711],
+                    ),
+                ]
+            )
             inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
             out = pipe(inp, guidance_scale=3)
             out["images"][0].save(f"public/images/{file_name}.jpg")
@@ -459,30 +557,47 @@ def models(model_id):

         # image to text
         if model_id == "Salesforce/blip-image-captioning-large":
-            raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
+            raw_image = load_image(request.get_json()["img_url"]).convert("RGB")
             text = request.get_json()["text"]
-            inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
+            inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(
+                pipes[model_id]["device"]
+            )
             out = pipe.generate(**inputs)
-            caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
+            caption = pipes[model_id]["processor"].decode(
+                out[0], skip_special_tokens=True
+            )
             result = {"generated text": caption}
         if model_id == "ydshieh/vit-gpt2-coco-en":
             img_url = request.get_json()["img_url"]
-            generated_text = pipe(img_url)[0]['generated_text']
+            generated_text = pipe(img_url)[0]["generated_text"]
             result = {"generated text": generated_text}
         if model_id == "nlpconnect/vit-gpt2-image-captioning":
             image = load_image(request.get_json()["img_url"]).convert("RGB")
-            pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
+            pixel_values = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            ).pixel_values
             pixel_values = pixel_values.to(pipes[model_id]["device"])
-            generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
-            generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
+            generated_ids = pipe.generate(
+                pixel_values, **{"max_length": 200, "num_beams": 1}
+            )
+            generated_text = pipes[model_id]["tokenizer"].batch_decode(
+                generated_ids, skip_special_tokens=True
+            )[0]
             result = {"generated text": generated_text}
         # image to text: OCR
-        if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
+        if (
+            model_id == "microsoft/trocr-base-printed"
+            or model_id == "microsoft/trocr-base-handwritten"
+        ):
             image = load_image(request.get_json()["img_url"]).convert("RGB")
-            pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
+            pixel_values = pipes[model_id]["processor"](
+                image, return_tensors="pt"
+            ).pixel_values
             pixel_values = pixel_values.to(pipes[model_id]["device"])
             generated_ids = pipe.generate(pixel_values)
-            generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
+            generated_text = pipes[model_id]["processor"].batch_decode(
+                generated_ids, skip_special_tokens=True
+            )[0]
             result = {"generated text": generated_text}

         # text to image
@@ -494,9 +609,87 @@ def models(model_id):
             result = {"path": f"/images/{file_name}.jpg"}

         # object detection
-        if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
+        if (
+            model_id == "google/owlvit-base-patch32"
+            or model_id == "facebook/detr-resnet-101"
+        ):
             img_url = request.get_json()["img_url"]
-            open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
+            open_types = [
+                "cat",
+                "couch",
+                "person",
+                "car",
+                "dog",
+                "horse",
+                "sheep",
+                "cow",
+                "elephant",
+                "bear",
+                "zebra",
+                "giraffe",
+                "backpack",
+                "umbrella",
+                "handbag",
+                "tie",
+                "suitcase",
+                "frisbee",
+                "skis",
+                "snowboard",
+                "sports ball",
+                "kite",
+                "baseball bat",
+                "baseball glove",
+                "skateboard",
+                "surfboard",
+                "tennis racket",
+                "bottle",
+                "wine glass",
+                "cup",
+                "fork",
+                "knife",
+                "spoon",
+                "bowl",
+                "banana",
+                "apple",
+                "sandwich",
+                "orange",
+                "broccoli",
+                "carrot",
+                "hot dog",
+                "pizza",
+                "donut",
+                "cake",
+                "chair",
+                "couch",
+                "potted plant",
+                "bed",
+                "dining table",
+                "toilet",
+                "tv",
+                "laptop",
+                "mouse",
+                "remote",
+                "keyboard",
+                "cell phone",
+                "microwave",
+                "oven",
+                "toaster",
+                "sink",
+                "refrigerator",
+                "book",
+                "clock",
+                "vase",
+                "scissors",
+                "teddy bear",
+                "hair drier",
+                "toothbrush",
+                "traffic light",
+                "fire hydrant",
+                "stop sign",
+                "parking meter",
+                "bench",
+                "bird",
+            ]
             result = pipe(img_url, candidate_labels=open_types)

         # VQA
@@ -514,14 +707,16 @@ def models(model_id):
         # depth-estimation
         if model_id == "Intel/dpt-large":
             output = pipe(request.get_json()["img_url"])
-            image = output['depth']
+            image = output["depth"]
             name = str(uuid.uuid4())[:4]
             image.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}

         if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
             image = load_image(request.get_json()["img_url"])
-            inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
+            inputs = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            )
             with torch.no_grad():
                 outputs = pipe(**inputs)
                 predicted_depth = outputs.predicted_depth
@@ -550,11 +745,21 @@ def models(model_id):
             text = request.get_json()["text"]
             inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
             embeddings_dataset = pipes[model_id]["embeddings_dataset"]
-            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
+            speaker_embeddings = (
+                torch.tensor(embeddings_dataset[7306]["xvector"])
+                .unsqueeze(0)
+                .to(pipes[model_id]["device"])
+            )
             pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
-            speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
+            speech = pipe.generate_speech(
+                inputs["input_ids"].to(pipes[model_id]["device"]),
+                speaker_embeddings,
+                vocoder=pipes[model_id]["vocoder"],
+            )
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
+            sf.write(
+                f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000
+            )
             result = {"path": f"/audios/{name}.wav"}

         # ASR
@@ -569,19 +774,31 @@ def models(model_id):
             with torch.no_grad():
                 result_wav = pipe(wav.to(pipes[model_id]["device"]))
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
+            sf.write(
+                f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr
+            )
             result = {"path": f"/audios/{name}.wav"}

         if model_id == "microsoft/speecht5_vc":
             audio_url = request.get_json()["audio_url"]
             wav, sr = torchaudio.load(audio_url)
-            inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
+            inputs = pipes[model_id]["processor"](
+                audio=wav, sampling_rate=sr, return_tensors="pt"
+            )
             embeddings_dataset = pipes[model_id]["embeddings_dataset"]
-            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+            speaker_embeddings = torch.tensor(
+                embeddings_dataset[7306]["xvector"]
+            ).unsqueeze(0)
             pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
-            speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
+            speech = pipe.generate_speech(
+                inputs["input_ids"].to(pipes[model_id]["device"]),
+                speaker_embeddings,
+                vocoder=pipes[model_id]["vocoder"],
+            )
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
+            sf.write(
+                f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000
+            )
             result = {"path": f"/audios/{name}.wav"}

         # segmentation
@@ -592,24 +809,44 @@ def models(model_id):

             colors = []
             for i in range(len(segments)):
-                colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
+                colors.append(
+                    (
+                        random.randint(100, 255),
+                        random.randint(100, 255),
+                        random.randint(100, 255),
+                        50,
+                    )
+                )

             for segment in segments:
                 mask = segment["mask"]
-                mask = mask.convert('L')
-                layer = Image.new('RGBA', mask.size, colors[i])
+                mask = mask.convert("L")
+                layer = Image.new("RGBA", mask.size, colors[i])
                 image.paste(layer, (0, 0), mask)
             name = str(uuid.uuid4())[:4]
             image.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}

-        if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
+        if (
+            model_id == "facebook/maskformer-swin-base-coco"
+            or model_id == "facebook/maskformer-swin-large-ade"
+        ):
             image = load_image(request.get_json()["img_url"])
-            inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
+            inputs = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            ).to(pipes[model_id]["device"])
             outputs = pipe(**inputs)
-            result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+            result = pipes[model_id][
+                "feature_extractor"
+            ].post_process_panoptic_segmentation(
+                outputs, target_sizes=[image.size[::-1]]
+            )[
+                0
+            ]
             predicted_panoptic_map = result["segmentation"].cpu().numpy()
-            predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
+            predicted_panoptic_map = Image.fromarray(
+                predicted_panoptic_map.astype(np.uint8)
+            )
             name = str(uuid.uuid4())[:4]
             predicted_panoptic_map.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}
@@ -641,7 +878,7 @@ def models(model_id):
     return jsonify(result)


-if __name__ == '__main__':
+if __name__ == "__main__":
    # temp folders
    if not os.path.exists("public/audios"):
        os.makedirs("public/audios")
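Usage sketch (not part of the diff above): the Flask routes keep the same paths and JSON keys after reformatting, so a client can poll /running and /status/<model_id> and POST inputs to /models/<model_id>. The snippet below is a minimal, assumption-based example; the base URL is a placeholder not taken from this diff, while the route paths and the "img_url", "running", "loaded", and "generated text" keys come from the handlers shown above.

# Hypothetical client sketch; BASE_URL is an assumed placeholder.
import requests

BASE_URL = "http://localhost:8005"  # assumption: point this at wherever models_server.py runs

def is_running() -> bool:
    # GET /running returns {"running": True} once the pipes are loaded
    return requests.get(f"{BASE_URL}/running").json().get("running", False)

def is_loaded(model_id: str) -> bool:
    # GET /status/<model_id> returns {"loaded": True} or {"loaded": False}
    return requests.get(f"{BASE_URL}/status/{model_id}").json().get("loaded", False)

def caption_image(img_url: str) -> dict:
    # POST /models/<model_id> with the JSON keys the handler reads ("img_url" here)
    resp = requests.post(
        f"{BASE_URL}/models/nlpconnect/vit-gpt2-image-captioning",
        json={"img_url": img_url},
    )
    return resp.json()  # e.g. {"generated text": "..."}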