Skip to content

vllm.entrypoints.serve.elastic_ep.middleware ¶

_scaling_elastic_ep `module-attribute` ¶

# Module-level scaling flag: read via get_scaling_elastic_ep() and
# overwritten via set_scaling_elastic_ep(); it starts out as False.
_scaling_elastic_ep = False

ScalingMiddleware ¶

Middleware that checks if the model is currently scaling and returns a 503 Service Unavailable response if it is.

This middleware applies to all HTTP requests and prevents processing when the model is in a scaling state.

Source code in vllm/entrypoints/serve/elastic_ep/middleware.py

class ScalingMiddleware:
    """
    ASGI middleware that turns away HTTP requests while the model is scaling.

    For scopes whose type is ``"http"``, the global scaling flag is read
    through ``get_scaling_elastic_ep()``. If the flag is set, a 503 Service
    Unavailable JSON response is produced instead of calling the wrapped
    application. Scopes of any other type are handed to the wrapped
    application without consulting the flag.
    """

    def __init__(self, app: ASGIApp) -> None:
        # The wrapped application that receives every request not rejected here.
        self.app = app

    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
        # Short-circuit keeps the flag lookup restricted to HTTP scopes.
        reject = scope["type"] == "http" and get_scaling_elastic_ep()
        if not reject:
            return self.app(scope, receive, send)

        # Scaling in progress: answer with 503 rather than invoking the app.
        unavailable = JSONResponse(
            status_code=503,
            content={
                "error": "The model is currently scaling. Please try again later."
            },
        )
        return unavailable(scope, receive, send)

app `instance-attribute` ¶

app = app

__call__ ¶

__call__(
    scope: Scope, receive: Receive, send: Send
) -> Awaitable[None]

Source code in vllm/entrypoints/serve/elastic_ep/middleware.py

def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
    """
    Dispatch one ASGI call, rejecting HTTP requests while scaling.

    Returns the awaitable produced by a 503 JSON response when the scope
    type is ``"http"`` and ``get_scaling_elastic_ep()`` is truthy;
    otherwise returns the awaitable produced by the wrapped application.
    """
    # Short-circuit keeps the flag lookup restricted to HTTP scopes.
    reject = scope["type"] == "http" and get_scaling_elastic_ep()
    if not reject:
        return self.app(scope, receive, send)

    # Scaling in progress: answer with 503 rather than invoking the app.
    unavailable = JSONResponse(
        status_code=503,
        content={
            "error": "The model is currently scaling. Please try again later."
        },
    )
    return unavailable(scope, receive, send)

__init__ ¶

__init__(app: ASGIApp) -> None

Source code in vllm/entrypoints/serve/elastic_ep/middleware.py

def __init__(self, app: ASGIApp) -> None:
    """Store the wrapped ASGI application on the instance as ``self.app``."""
    self.app = app

get_scaling_elastic_ep ¶

get_scaling_elastic_ep()

Source code in vllm/entrypoints/serve/elastic_ep/middleware.py

def get_scaling_elastic_ep():
    """Return the current value of the module-level ``_scaling_elastic_ep`` flag."""
    return _scaling_elastic_ep

set_scaling_elastic_ep ¶

set_scaling_elastic_ep(value)

Source code in vllm/entrypoints/serve/elastic_ep/middleware.py

def set_scaling_elastic_ep(value):
    """Replace the module-level ``_scaling_elastic_ep`` flag with ``value``."""
    # ``global`` is required so the assignment rebinds the module attribute
    # instead of creating a function-local name.
    global _scaling_elastic_ep
    _scaling_elastic_ep = value