<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Vision Language Model Archives - Urban Geo Analytics</title>
	<atom:link href="https://urbangeoanalytics.com/category/vision-language-model/feed/" rel="self" type="application/rss+xml" />
	<link>https://urbangeoanalytics.com/category/vision-language-model/</link>
	<description>Spatial Analysis, GeoAI &#38; Machine Learning</description>
	<lastBuildDate>Thu, 21 May 2026 10:27:47 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>https://wordpress.org/?v=7.0</generator>

<image>
	<url>https://urbangeoanalytics.com/wp-content/uploads/2025/11/cropped-logo-urban-geo_512-32x32.png</url>
	<title>Vision Language Model Archives - Urban Geo Analytics</title>
	<link>https://urbangeoanalytics.com/category/vision-language-model/</link>
	<width>32</width>
	<height>32</height>
</image> 
	<item>
		<title>SAGAI v2.0 — A Unified Multi-Model Notebook for Streetscape Analysis</title>
		<link>https://urbangeoanalytics.com/sagai-v2-multi-model-streetscape-analysis-uvlm/</link>
					<comments>https://urbangeoanalytics.com/sagai-v2-multi-model-streetscape-analysis-uvlm/#respond</comments>
		
		<dc:creator><![CDATA[Joan Perez]]></dc:creator>
		<pubDate>Thu, 21 May 2026 10:11:18 +0000</pubDate>
				<category><![CDATA[Advanced]]></category>
		<category><![CDATA[Python]]></category>
		<category><![CDATA[Vision Language Model]]></category>
		<category><![CDATA[AI]]></category>
		<category><![CDATA[GIS]]></category>
		<category><![CDATA[Image Analysis]]></category>
		<category><![CDATA[Llava]]></category>
		<category><![CDATA[Qwen]]></category>
		<category><![CDATA[UVLM]]></category>
		<guid isPermaLink="false">https://urbangeoanalytics.com/?p=2483</guid>

					<description><![CDATA[<p>SAGAI v2.0 consolidates the full streetscape analysis pipeline into a single Google Colab notebook and replaces the inline LLaVA-only inference code with the UVLM package, enabling multi-model benchmarking across 11 VLM checkpoints. New features include a multi-task prompt builder, consensus validation with majority voting, chain-of-thought reasoning, truncation detection, interactive Folium maps, view-direction filtering, and support for loading existing polygons as study area boundaries.</p>
<p>The post <a href="https://urbangeoanalytics.com/sagai-v2-multi-model-streetscape-analysis-uvlm/">SAGAI v2.0 — A Unified Multi-Model Notebook for Streetscape Analysis</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="fusion-fullwidth fullwidth-box fusion-builder-row-1 fusion-flex-container has-pattern-background has-mask-background nonhundred-percent-fullwidth non-hundred-percent-height-scrolling" style="--awb-border-radius-top-left:0px;--awb-border-radius-top-right:0px;--awb-border-radius-bottom-right:0px;--awb-border-radius-bottom-left:0px;--awb-flex-wrap:wrap;" id="contenu" ><div class="fusion-builder-row fusion-row fusion-flex-align-items-flex-start fusion-flex-content-wrap" style="max-width:1248px;margin-left: calc(-4% / 2 );margin-right: calc(-4% / 2 );"><div class="fusion-layout-column fusion_builder_column fusion-builder-column-0 fusion_builder_column_3_4 3_4 fusion-flex-column" style="--awb-bg-size:cover;--awb-width-large:75%;--awb-margin-top-large:0px;--awb-spacing-right-large:2.56%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:2.56%;--awb-width-medium:75%;--awb-order-medium:0;--awb-spacing-right-medium:2.56%;--awb-spacing-left-medium:2.56%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;" id="contenu" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-1" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" 
fusion-imageframe imageframe-none imageframe-1 hover-type-none"><img fetchpriority="high" decoding="async" width="1760" height="545" title="e4e3b0b4-83a7-4933-ba0b-ef1775beacc6" src="https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6.png" alt class="img-responsive wp-image-2489" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6-200x62.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6-400x124.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6-600x186.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6-800x248.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6-1200x372.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/e4e3b0b4-83a7-4933-ba0b-ef1775beacc6.png 1760w" sizes="(max-width: 640px) 100vw, 1200px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title"> </div></div></div></div><div class="fusion-text fusion-text-1"><h5><strong>Highlights</strong></h5>
</div><div class="fusion-text fusion-text-2" style="--awb-margin-top:-30px;"><ul>
<li>SAGAI v2.0 merges the previous four-module notebook architecture into a <strong>single unified Google Colab notebook</strong> (SAGAI.ipynb) organized in six sequential blocks.</li>
<li>The inline LLaVA-only inference code is replaced by the <strong>UVLM package</strong> (Universal Vision-Language Model Loader), installed automatically from GitHub, providing access to <strong>11 VLM checkpoints</strong> across two model families.</li>
<li>New capabilities include a <strong>multi-task prompt builder</strong>, <strong>consensus validation</strong> with majority voting, <strong>chain-of-thought reasoning</strong>, <strong>truncation detection</strong>, <strong>interactive Folium maps</strong>, <strong>view-direction filtering</strong>, and support for <strong>loading an existing study area polygon</strong>.</li>
</ul>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-1 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Introduction</h2></div><div class="fusion-text fusion-text-3 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">SAGAI (Streetscape Analysis with Generative Artificial Intelligence) is an open-source workflow for scoring and mapping street-level urban environments using vision-language models and open geospatial data. Since its initial release, SAGAI has been structured as a set of independent Colab notebooks, one per pipeline stage, each relying on its own dependencies and documentation.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">SAGAI v2.0 is a major release that consolidates the entire pipeline into a single notebook and replaces the custom inference code with the UVLM package. Where previous versions were tied to a single LLaVA checkpoint with handwritten inference logic, SAGAI v2.0 delegates all vision-language model loading, prompting, and evaluation to UVLM&#8217;s unified interface. This makes the scoring engine model-agnostic: users can select from 11 VLM checkpoints spanning the LLaVA-NeXT and Qwen2.5-VL families, compare their performance on identical tasks, and benefit from features such as consensus validation, reasoning traces, and truncation diagnostics — all within the same notebook.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Beyond the inference engine, v2.0 introduces structural and functional changes across the entire pipeline: a unified six-block architecture, interactive HTML mapping via Folium, view-direction filtering for aggregation, and the ability to load an existing polygon as a study area boundary instead of defining a bounding box manually.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">This post details the architectural changes, the UVLM integration, and the new features introduced in SAGAI v2.0.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-2 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">1. From Four Notebooks to One: The Unified Architecture</h2></div><div class="fusion-text fusion-text-4 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Previous SAGAI releases were organized as four independent Colab notebooks — one for street sampling, one for image retrieval, one for VLM inference, and one for aggregation and mapping — each accompanied by a separate NOTICE file documenting its dependencies and usage. This modular design was useful for development but introduced friction in practice: users had to manage file paths between notebooks, track four separate environments, and consult multiple documentation files.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">SAGAI v2.0 merges all four stages into a single notebook (SAGAI.ipynb) structured as six sequential blocks. The pipeline flows from study area definition through street sampling, image downloading, VLM scoring, and mapping, with all intermediate data passed directly between blocks in the same runtime session. The separate per-module NOTICE files and the standalone requirements file (requirements_sagai_module_3_v1-0.txt) have been removed — dependency management is now handled automatically by the UVLM package installation.</p>
</div><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-2" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-2 hover-type-none"><img decoding="async" width="2000" height="948" title="pipeline details" src="https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-scaled.png" alt class="img-responsive wp-image-2480" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-300x142.png 300w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-768x364.png 768w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-1024x486.png 1024w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-1536x728.png 1536w, https://urbangeoanalytics.com/wp-content/uploads/2026/05/pipeline-details-scaled.png 2000w" sizes="(max-width: 2000px) 100vw, 2000px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title">Diagram of the six-block architecture</div></div></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" 
style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-3 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">2. Study Area Definition: Bounding Box or Existing Polygon</h2></div><div class="fusion-text fusion-text-5 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">In previous versions, the study area was defined exclusively by a bounding box in WGS84 coordinates. SAGAI v2.0 retains this option but adds the ability to draw your own polygon or to load an existing polygon; for example, a GeoPackage representing a neighborhood, municipality, or custom boundary. When a polygon is provided, the street sampling step extracts the OpenStreetMap network within that geometry rather than a rectangular extent. This makes it straightforward to work with irregular administrative boundaries or user-defined study zones without manually computing bounding coordinates.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-4 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">3. UVLM Integration: From Single-Model Inference to Multi-Model Benchmarking</h2></div><div class="fusion-text fusion-text-6 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">The most significant change in SAGAI v2.0 is the replacement of the inline inference code with the <a class="keychainify-checked" href="https://github.com/perezjoan/UVLM/tree/main">UVLM package</a>. In previous versions, Blocks 3 through 5 contained custom code for loading a single LLaVA checkpoint, constructing prompts, running inference, and parsing outputs. This logic was tightly coupled to one model architecture and required manual maintenance when Hugging Face APIs or model formats changed.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">SAGAI v2.0 installs UVLM directly from its GitHub repository at the start of the notebook. All model loading, prompt formatting, inference execution, response parsing, and batch processing are delegated to UVLM&#8217;s API. The inline inference code has been entirely removed.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Through UVLM, SAGAI v2.0 supports 11 VLM checkpoints across two model families:</p>
<ul class="&#091;li_&amp;&#093;:mb-0 &#091;li_&amp;&#093;:mt-1 &#091;li_&amp;&#093;:gap-1 &#091;&amp;:not(:last-child)_ul&#093;:pb-1 &#091;&amp;:not(:last-child)_ol&#093;:pb-1 list-disc flex flex-col gap-1 pl-8 mb-3">
<li class="font-claude-response-body whitespace-normal break-words pl-2"><strong>LLaVA-NeXT</strong> — Mistral 7B, Vicuna 7B, Vicuna 13B, 34B, LLaMA3 8B, 72B, 110B</li>
<li class="font-claude-response-body whitespace-normal break-words pl-2"><strong>Qwen2.5-VL</strong> — 3B Instruct, 7B Instruct, 32B Instruct, 72B Instruct</li>
</ul>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">UVLM&#8217;s dual-backend abstraction automatically detects the model family and routes inference to the correct pipeline — LlavaNextProcessor for LLaVA models, AutoProcessor with process_vision_info for Qwen models — so users switch between architectures by changing a single model selection, with no modification to the rest of the notebook.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Quantization is handled through UVLM&#8217;s built-in support for 4-bit, 8-bit, and FP16 precision via BitsAndBytes. Models up to 34B parameters can run on a single Colab GPU (T4 or A100) with 4-bit quantization.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-5 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">4. Multi-Task Prompt Builder</h2></div><div class="fusion-text fusion-text-7 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">UVLM provides a widget-based prompt builder that SAGAI v2.0 exposes directly in the notebook. Users can define up to 10 analysis tasks per run, each with its own prompt, response type (numeric, category, boolean, or text), and label. This replaces the previous approach of selecting from a small set of hardcoded tasks (T1, T2, T3) or manually editing prompt strings in the code.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Tasks are configured interactively before execution and applied uniformly across all images in the batch. Each task produces its own column in the output CSV file.</p>
</div><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-3" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-3 hover-type-none"><img decoding="async" width="866" height="1063" title="image2" src="https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2.png" alt class="img-responsive wp-image-2320" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2-200x245.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2-400x491.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2-600x736.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2-800x982.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image2.png 866w" sizes="(max-width: 640px) 100vw, 866px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title">UVLM prompt builder</div></div></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-6 fusion-sep-none 
fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">5. Consensus Validation</h2></div><div class="fusion-text fusion-text-8 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">SAGAI v2.0 inherits UVLM&#8217;s consensus validation mechanism. Each analysis task can be run 2 to 5 times per image, and the final score is determined by majority voting across the repeated inferences. NA values from failed parses are filtered before voting. An agreement ratio is recorded alongside the final score, providing a built-in measure of prediction reliability without any external validation step.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-7 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">6. Chain-of-Thought Reasoning and Truncation Detection</h2></div><div class="fusion-text fusion-text-9 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">UVLM supports two approaches to chain-of-thought (CoT) reasoning, both available in SAGAI v2.0. Users can write task prompts that explicitly request step-by-step reasoning and adjust the token budget (up to 1,500 tokens) to allow the model sufficient generation space. Alternatively, a built-in CoT reference mode can be enabled per task, which triggers a standardized reasoning template with a fixed 1,024-token budget. In both cases, the reasoning trace is stored in a dedicated column in the output CSV for inspection.</p>
<p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Truncation detection is performed automatically after every inference call. The exact number of generated tokens is compared against the token limit, and truncated responses are flagged in per-task CSV columns. This allows users to identify tasks where the token budget is insufficient without post-hoc analysis.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-8 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">7. Interactive Mapping with Folium</h2></div><div class="fusion-text fusion-text-10 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Previous SAGAI versions generated static thematic maps using Matplotlib. SAGAI v2.0 replaces these with interactive HTML maps built with Folium. Point-level and street-segment-level scores are rendered as interactive layers that can be panned, zoomed, and queried directly in the browser. This is particularly useful for exploratory analysis and for sharing results with collaborators who do not use GIS software.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-9 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">8. View-Direction Filtering for Aggregation</h2></div><div class="fusion-text fusion-text-11 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">Google Street View images are typically downloaded in multiple compass directions at each sampling point (e.g., front, back, left, right). In previous versions, all views were aggregated together when computing point- or street-level scores. SAGAI v2.0 introduces a view filter that allows users to select which directions to include in the aggregation — for example, scoring only left-side and right-side views to focus on building facades, or only front views to capture the pedestrian perspective along the street axis. This filter is applied at the aggregation stage and does not affect the scoring step itself.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-10 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">9. Resume-Safe Batch Processing</h2></div><div class="fusion-text fusion-text-12 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p class="font-claude-response-body break-words whitespace-normal leading-&#091;1.7&#093;">The batch execution engine inherited from UVLM provides resume-safe processing with checkpoint saving every 3 images. If a Colab session is interrupted — due to a timeout, a runtime reset, or a connectivity issue — the notebook can be re-executed and will automatically skip already-processed images. New tasks added between runs trigger automatic CSV schema upgrading, so the output file grows incrementally without losing previous results.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-11 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">10. References and Links</h2></div><div class="fusion-text fusion-text-13 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><ul>
<li class="font-claude-response-body whitespace-normal break-words pl-2">SAGAI v2.0 on GitHub: <a class="underline underline-offset-2 decoration-1 decoration-current/40 hover:decoration-current focus:decoration-current keychainify-checked" href="https://github.com/perezjoan/SAGAI">https://github.com/perezjoan/SAGAI</a></li>
<li class="font-claude-response-body whitespace-normal break-words pl-2">UVLM on GitHub: <a class="underline underline-offset-2 decoration-1 decoration-current/40 hover:decoration-current focus:decoration-current keychainify-checked" href="https://github.com/perezjoan/UVLM">https://github.com/perezjoan/UVLM</a></li>
<li class="font-claude-response-body whitespace-normal break-words pl-2">Perez, J. and Fusco, G. (2025). <em>Streetscape Analysis with Generative AI (SAGAI): Vision-Language Assessment and Mapping of Urban Scenes.</em> Geomatica, 77(2), 100063. <a class="underline underline-offset-2 decoration-1 decoration-current/40 hover:decoration-current focus:decoration-current keychainify-checked" href="https://www.sciencedirect.com/science/article/pii/S1195103625000199">https://www.sciencedirect.com/science/article/pii/S1195103625000199</a></li>
<li class="font-claude-response-body whitespace-normal break-words pl-2">Perez, J. and Fusco, G. (2026). <em>UVLM: A Universal Vision-Language Model Loader for Reproducible Multimodal Benchmarking.</em> arXiv:2603.13893. <a class="underline underline-offset-2 decoration-1 decoration-current/40 hover:decoration-current focus:decoration-current keychainify-checked" href="https://arxiv.org/abs/2603.13893">https://arxiv.org/abs/2603.13893</a></li>
</ul>
</div></div></div><div class="fusion-layout-column fusion_builder_column fusion-builder-column-1 awb-sticky awb-sticky-medium awb-sticky-large fusion_builder_column_1_4 1_4 fusion-flex-column" style="--awb-padding-top:20px;--awb-padding-right:20px;--awb-padding-bottom:20px;--awb-padding-left:20px;--awb-bg-size:cover;--awb-border-color:var(--awb-color6);--awb-border-style:solid;--awb-width-large:25%;--awb-margin-top-large:0px;--awb-spacing-right-large:7.68%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:7.68%;--awb-width-medium:25%;--awb-order-medium:0;--awb-spacing-right-medium:7.68%;--awb-spacing-left-medium:7.68%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;--awb-sticky-offset:150px;" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-text fusion-text-14"><p><span style="color: #143c4e;"><strong>Table of contents</strong></span></p>
</div><div class="awb-toc-el awb-toc-el--1" data-awb-toc-id="1" data-awb-toc-options="{&quot;allowed_heading_tags&quot;:{&quot;h2&quot;:0},&quot;ignore_headings&quot;:&quot;&quot;,&quot;ignore_headings_words&quot;:&quot;&quot;,&quot;enable_cache&quot;:&quot;no&quot;,&quot;highlight_current_heading&quot;:&quot;yes&quot;,&quot;hide_hidden_titles&quot;:&quot;no&quot;,&quot;limit_container&quot;:&quot;page_content&quot;,&quot;select_custom_headings&quot;:&quot;.contenu H2, .contenu H3&quot;,&quot;icon&quot;:&quot;fa-flag fas&quot;,&quot;counter_type&quot;:&quot;none&quot;}" style="--awb-item-padding-right:5px;--awb-item-padding-left:5px;"><div class="awb-toc-el__content"></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:20px;margin-bottom:20px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-image-element " style="--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--h2_typography-font-family);--awb-caption-title-font-weight:var(--h2_typography-font-weight);--awb-caption-title-font-style:var(--h2_typography-font-style);--awb-caption-title-size:var(--h2_typography-font-size);--awb-caption-title-transform:var(--h2_typography-text-transform);--awb-caption-title-line-height:var(--h2_typography-line-height);--awb-caption-title-letter-spacing:var(--h2_typography-letter-spacing);--awb-filter:saturate(100%);--awb-filter-transition:filter 0.3s ease;--awb-filter-hover:saturate(0%);"><span class=" fusion-imageframe imageframe-none imageframe-4 hover-type-zoomout"><img decoding="async" width="1536" height="1024" src="https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3.png" alt class="img-responsive wp-image-1688" 
srcset="https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-200x133.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-400x267.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-600x400.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-800x533.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-1200x800.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3.png 1536w" sizes="(max-width: 640px) 100vw, 400px" /></span></div></div></div></div></div>
<p>The post <a href="https://urbangeoanalytics.com/sagai-v2-multi-model-streetscape-analysis-uvlm/">SAGAI v2.0 — A Unified Multi-Model Notebook for Streetscape Analysis</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://urbangeoanalytics.com/sagai-v2-multi-model-streetscape-analysis-uvlm/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>UVLM v3.0.0: From Colab Notebook to Python Package — Run Vision-Language Models Anywhere</title>
		<link>https://urbangeoanalytics.com/uvlm-python-package-vision-language-models/</link>
					<comments>https://urbangeoanalytics.com/uvlm-python-package-vision-language-models/#respond</comments>
		
		<dc:creator><![CDATA[Joan Perez]]></dc:creator>
		<pubDate>Thu, 23 Apr 2026 07:25:41 +0000</pubDate>
				<category><![CDATA[Advanced]]></category>
		<category><![CDATA[Package]]></category>
		<category><![CDATA[Python]]></category>
		<category><![CDATA[Vision Language Model]]></category>
		<category><![CDATA[AI]]></category>
		<category><![CDATA[Google Colab]]></category>
		<category><![CDATA[Image Analysis]]></category>
		<category><![CDATA[Jupyter Notebook]]></category>
		<category><![CDATA[Llava]]></category>
		<category><![CDATA[Qwen]]></category>
		<category><![CDATA[UVLM]]></category>
		<guid isPermaLink="false">https://urbangeoanalytics.com/?p=2442</guid>

					<description><![CDATA[<p>UVLM v3.0.0 turns a Colab notebook into a full Python package. Run vision-language models locally, in notebooks, or scripts with a simple API and no setup complexity.</p>
<p>The post <a href="https://urbangeoanalytics.com/uvlm-python-package-vision-language-models/">UVLM v3.0.0: From Colab Notebook to Python Package — Run Vision-Language Models Anywhere</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="fusion-fullwidth fullwidth-box fusion-builder-row-2 fusion-flex-container has-pattern-background has-mask-background nonhundred-percent-fullwidth non-hundred-percent-height-scrolling" style="--awb-border-radius-top-left:0px;--awb-border-radius-top-right:0px;--awb-border-radius-bottom-right:0px;--awb-border-radius-bottom-left:0px;--awb-flex-wrap:wrap;" id="contenu" ><div class="fusion-builder-row fusion-row fusion-flex-align-items-flex-start fusion-flex-content-wrap" style="max-width:1248px;margin-left: calc(-4% / 2 );margin-right: calc(-4% / 2 );"><div class="fusion-layout-column fusion_builder_column fusion-builder-column-2 fusion_builder_column_3_4 3_4 fusion-flex-column" style="--awb-bg-size:cover;--awb-width-large:75%;--awb-margin-top-large:0px;--awb-spacing-right-large:2.56%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:2.56%;--awb-width-medium:75%;--awb-order-medium:0;--awb-spacing-right-medium:2.56%;--awb-spacing-left-medium:2.56%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;" id="contenu" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-5" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" 
fusion-imageframe imageframe-none imageframe-5 hover-type-none"><img decoding="async" width="1619" height="971" title="flag fig" src="https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig.png" alt class="img-responsive wp-image-2469" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig-200x120.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig-400x240.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig-600x360.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig-800x480.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig-1200x720.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/flag-fig.png 1619w" sizes="(max-width: 640px) 100vw, 1200px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title"> </div></div></div></div><div class="fusion-text fusion-text-15"><h5><strong>Highlights</strong></h5>
</div><div class="fusion-text fusion-text-16" style="--awb-margin-top:-30px;"><ul>
<li><strong data-start="64" data-end="88">UVLM is now a pip-installable Python package </strong>— no longer tied to Google Colab</li>
<li><strong data-start="64" data-end="88">Run on your own GPU </strong>with a local Jupyter notebook, or keep using Colab for free</li>
<li><strong data-start="64" data-end="88">Same tool, more flexibility </strong>— three lines of Python to load a model and analyse images</li>
</ul>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-text fusion-text-17 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>When we released UVLM in March 2026, it was a Google Colab notebook. You opened it in your browser, picked a model, typed your prompts, and ran your images — all without installing anything. That simplicity was the point: a tool that anyone could use to load and compare Vision-Language Models, regardless of their technical setup.</p>
<p>But we kept hearing the same requests. Can I run this on my own machine? Can I call UVLM from a script? Can I integrate it into an existing pipeline? The answer was always the same: not easily. The entire tool lived inside a single notebook, with all the logic packed into three massive code cells. Moving it anywhere else meant copy-pasting thousands of lines and untangling global variables.</p>
<p>Version 3.0.0 changes that. UVLM is now a proper Python package.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-12 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">What Changed</h2></div><div class="fusion-text fusion-text-18 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>The core logic — model loading, dual-backend inference, response parsing, consensus validation, batch processing — has been extracted from the notebook into eight standalone Python modules. These modules have no dependency on Google Colab, no global variables, and no widget code. They are plain Python functions that accept arguments and return results.</p>
</div><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-6" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-6 hover-type-none"><img decoding="async" width="2000" height="1162" title="UVLM package blogpost figure 1" src="https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-scaled.png" alt class="img-responsive wp-image-2444" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-200x116.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-400x232.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-600x349.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-800x465.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-1200x697.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-package-blogpost-figure-1-scaled.png 2000w" sizes="(max-width: 640px) 100vw, 1200px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title"> </div></div></div></div><div class="fusion-text fusion-text-19 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>The package 
is installed from GitHub in one line:</p>
</div><div class="fusion-text fusion-text-20 fusion-text-no-margin" style="--awb-margin-top:1px;--awb-margin-bottom:25px;"><pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="dracula" data-enlighter-group="Python1" data-enlighter-title="Python">pip install git+https://github.com/perezjoan/UVLM.git</pre>
</div><div class="fusion-text fusion-text-21 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:5px;--awb-margin-bottom:25px;"><p>On Google Colab, this happens automatically in the first cell of the Colab notebook. On your local machine, you run it once in a terminal and you are done.</p>
<p>Nothing changed in how UVLM analyses images. The same 11 model checkpoints are supported (LLaVA-NeXT and Qwen2.5-VL, from 3B to 110B parameters). The same parsing logic, the same consensus validation, the same truncation detection. If you had a workflow built on v2.2.2, the outputs will be identical.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-13 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Three Ways to Use UVLM</h2></div><div class="fusion-text fusion-text-22 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p><strong>Google Colab — Zero Install</strong></p>
<p>This is the same experience as before. Open the Colab notebook, select a GPU runtime, and start working. The notebook installs the UVLM package automatically. Images are loaded from Google Drive. Nothing has changed for Colab users, except that the code running behind the widgets is now cleaner and easier to maintain.</p>
<p><strong>Local Jupyter Notebook — Your GPU, Your Data</strong></p>
<p>If you have an NVIDIA GPU on your workstation (or access to a GPU server), you can now run UVLM locally. The local Jupyter notebook provides the same widget-based interface — model selection dropdown, prompt builder form, batch execution button — but images are read from your local filesystem and results are saved locally. No Google account needed, no data leaves your machine.</p>
<p>This matters for researchers working with sensitive imagery (medical, security, proprietary datasets) or for anyone who wants faster and more reliable model loading than what Colab&#8217;s network provides.</p>
<p><strong>Python Script — Full Programmatic Control</strong></p>
<p>For integration into larger pipelines, UVLM now exposes a clean API. One import and three function calls replace the entire notebook workflow:</p>
</div><div class="fusion-text fusion-text-23 fusion-text-no-margin" style="--awb-margin-top:1px;--awb-margin-bottom:25px;"><pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="dracula" data-enlighter-group="Python2" data-enlighter-title="Python">from uvlm import load_model, run_inference, parse_response
ctx = load_model("[Qwen] Qwen2.5-VL 7B Instruct", precision="4bit")
raw, tokens = run_inference("photo.jpg", "Count the cars", ctx)
result = parse_response(raw, "numeric")</pre>
</div><div class="fusion-text fusion-text-24 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:5px;--awb-margin-bottom:25px;"><p>The <code>load_model()</code> function returns a context dictionary containing the model, processor, backend type, and device information. This dictionary is passed to every subsequent function — no global state, no hidden side effects. You can load multiple models in the same session and switch between them by passing different context objects.</p>
<p>For batch processing, <code>run_batch()</code> handles the full pipeline:</p>
</div><div class="fusion-text fusion-text-25 fusion-text-no-margin" style="--awb-margin-top:1px;--awb-margin-bottom:25px;"><pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="dracula" data-enlighter-group="Python3" data-enlighter-title="Python">from uvlm import load_model
from uvlm.batch import run_batch

ctx = load_model("[Qwen] Qwen2.5-VL 7B Instruct", precision="4bit")
df = run_batch(
    model_ctx=ctx,
    task_specs=my_tasks,
    image_folder="./images",
    output_path="./results.csv",
)
</pre>
</div><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-7" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-7 hover-type-none"><img decoding="async" width="2000" height="926" title="UVLM deploy blogpost figure 2" src="https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-scaled.png" alt class="img-responsive wp-image-2457" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-200x93.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-400x185.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-600x278.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-800x370.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-1200x556.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2026/04/UVLM-deploy-blogpost-figure-2-scaled.png 2000w" sizes="(max-width: 640px) 100vw, 1200px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title"> </div><p class="awb-imageframe-caption-text"> </p></div></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: 
auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-14 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Under the Hood: Package Structure</h2></div><div class="fusion-text fusion-text-26 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>The monolithic notebook has been split into eight modules, each with a single responsibility:</p>
<p><em>registry.py</em> holds the model dictionary — 11 checkpoints with their backend type and <strong>HuggingFace checkpoint ID</strong>. Adding a new model is one line in a dictionary.</p>
<p><em>loader.py</em> contains the <code>load_model()</code> function. It handles quantisation configuration (4-bit, 8-bit, FP16), device placement (single GPU, auto, CPU offload), and the LLaVA vs Qwen branching logic. It returns a dictionary — not a set of global variables.</p>
<p><em>inference.py</em> contains <code>run_inference()</code>, the dual-backend forward pass. It accepts a model context dictionary and returns the raw response plus the exact token count as a tuple. The full LLaVA response cleaning logic and the full Qwen token-trimming pipeline are preserved exactly as they were.</p>
<p><em>parsers.py</em> holds the four response parsers (numeric, category, boolean, text) and the advanced reasoning parser. These are pure functions with zero dependencies beyond Python&#8217;s standard library.</p>
<p><em>consensus.py</em> contains the majority voting logic. <em>batch.py</em> handles folder iteration, CSV writing, resume mode, and schema upgrading. <em>prompts.py</em> stores the task type definitions and the chain-of-thought templates. <em>utils.py</em> provides seed management, environment detection, and <strong>HuggingFace token</strong> retrieval.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-15 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Getting Started</h2></div><div class="fusion-text fusion-text-27 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p><strong>On Colab</strong>: Open the notebook from GitHub and run the three blocks as before. The package installs itself.</p>
<p><strong>Locally</strong>: First, install PyTorch with CUDA support matching your GPU driver (check with <code>nvidia-smi</code>). For example, with CUDA 12.8+:</p>
</div><div class="fusion-text fusion-text-28 fusion-text-no-margin" style="--awb-margin-top:1px;--awb-margin-bottom:25px;"><pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="dracula" data-enlighter-group="Python4" data-enlighter-title="Python">pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
pip install git+https://github.com/perezjoan/UVLM.git
</pre>
</div><div class="fusion-text fusion-text-30 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:5px;--awb-margin-bottom:25px;"><p>Then open the local Jupyter notebook.</p>
<p>You get the same dropdown menus, the same prompt builder form, the same batch execution. The only difference is that you type a local path for your image folder instead of a Google Drive path.</p>
<p>For HuggingFace authentication (needed for some gated models like LLaMA3-based checkpoints), either set the <code>HF_TOKEN</code> environment variable or run <code>huggingface-cli login</code> once in your terminal.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-16 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">What Is Next</h2></div><div class="fusion-text fusion-text-31 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>The package architecture makes it much easier to add new VLM families. InternVL, BLIP-2, CogVLM, DeepSeek-VL, and Molmo are planned for future releases — each one requires implementing the backend-specific sections of the inference function and adding entries to the registry, without touching the rest of the codebase.</p>
<p>We are also working on multi-GPU batching for parallel inference across images, video frame analysis support, and integration with the SAGAI workflow for automated streetscape analysis.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-17 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Links</h2></div><div class="fusion-text fusion-text-32 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>Source code: <a class="keychainify-checked" href="https://github.com/perezjoan/UVLM">github.com/perezjoan/UVLM</a></p>
<p>Paper: <a class="keychainify-checked" href="https://arxiv.org/abs/2603.13893">arXiv preprint</a> — Perez &amp; Fusco (2026)</p>
<p>UVLM page on this site: urbangeoanalytics.com › Software &amp; Algorithms › <a class="keychainify-checked" href="https://urbangeoanalytics.com/algorithms-softwares/uvlm-universal-vision-language-model-loader/">UVLM</a></p>
<p>Previous blog post: <a class="keychainify-checked" href="https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/">Introducing UVLM: A Free Tool to Compare AI Models That Understand Images</a></p>
</div><div class="fusion-title title fusion-title-18 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Citation</h2></div><div class="fusion-text fusion-text-33 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>If you use UVLM in your work, please cite:</p>
<p>Perez, J. &amp; Fusco, G. (2026). <em>UVLM: A Universal Vision-Language Model Loader for Reproducible Multimodal Benchmarking.</em> arXiv:2603.13893</p>
</div></div></div><div class="fusion-layout-column fusion_builder_column fusion-builder-column-3 awb-sticky awb-sticky-medium awb-sticky-large fusion_builder_column_1_4 1_4 fusion-flex-column" style="--awb-padding-top:20px;--awb-padding-right:20px;--awb-padding-bottom:20px;--awb-padding-left:20px;--awb-bg-size:cover;--awb-border-color:var(--awb-color6);--awb-border-style:solid;--awb-width-large:25%;--awb-margin-top-large:0px;--awb-spacing-right-large:7.68%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:7.68%;--awb-width-medium:25%;--awb-order-medium:0;--awb-spacing-right-medium:7.68%;--awb-spacing-left-medium:7.68%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;--awb-sticky-offset:150px;" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-text fusion-text-34"><p><span style="color: #143c4e;"><strong>Table of contents</strong></span></p>
</div><div class="awb-toc-el awb-toc-el--2" data-awb-toc-id="2" data-awb-toc-options="{&quot;allowed_heading_tags&quot;:{&quot;h2&quot;:0},&quot;ignore_headings&quot;:&quot;&quot;,&quot;ignore_headings_words&quot;:&quot;&quot;,&quot;enable_cache&quot;:&quot;no&quot;,&quot;highlight_current_heading&quot;:&quot;yes&quot;,&quot;hide_hidden_titles&quot;:&quot;no&quot;,&quot;limit_container&quot;:&quot;page_content&quot;,&quot;select_custom_headings&quot;:&quot;.contenu H2, .contenu H3&quot;,&quot;icon&quot;:&quot;fa-flag fas&quot;,&quot;counter_type&quot;:&quot;none&quot;}" style="--awb-item-padding-right:5px;--awb-item-padding-left:5px;"><div class="awb-toc-el__content"></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:20px;margin-bottom:20px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-image-element " style="--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--h2_typography-font-family);--awb-caption-title-font-weight:var(--h2_typography-font-weight);--awb-caption-title-font-style:var(--h2_typography-font-style);--awb-caption-title-size:var(--h2_typography-font-size);--awb-caption-title-transform:var(--h2_typography-text-transform);--awb-caption-title-line-height:var(--h2_typography-line-height);--awb-caption-title-letter-spacing:var(--h2_typography-letter-spacing);--awb-filter:saturate(100%);--awb-filter-transition:filter 0.3s ease;--awb-filter-hover:saturate(0%);"><span class=" fusion-imageframe imageframe-none imageframe-8 hover-type-zoomout"><img decoding="async" width="1536" height="1024" src="https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3.png" alt class="img-responsive wp-image-1688" 
srcset="https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-200x133.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-400x267.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-600x400.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-800x533.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3-1200x800.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/blog-lvl3.png 1536w" sizes="(max-width: 640px) 100vw, 400px" /></span></div></div></div></div></div>
<p>The post <a href="https://urbangeoanalytics.com/uvlm-python-package-vision-language-models/">UVLM v3.0.0: From Colab Notebook to Python Package — Run Vision-Language Models Anywhere</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://urbangeoanalytics.com/uvlm-python-package-vision-language-models/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Introducing UVLM: A Free Tool to Compare AI Models That Understand Images</title>
		<link>https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/</link>
					<comments>https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/#respond</comments>
		
		<dc:creator><![CDATA[Joan Perez]]></dc:creator>
		<pubDate>Tue, 17 Mar 2026 14:23:58 +0000</pubDate>
				<category><![CDATA[Intermediate]]></category>
		<category><![CDATA[Python]]></category>
		<category><![CDATA[Vision Language Model]]></category>
		<category><![CDATA[Benchmarking]]></category>
		<category><![CDATA[Chain-of-Thought]]></category>
		<category><![CDATA[Google Colab]]></category>
		<category><![CDATA[Image Analysis]]></category>
		<category><![CDATA[Llava]]></category>
		<category><![CDATA[Multimodal AI]]></category>
		<category><![CDATA[Open Source]]></category>
		<category><![CDATA[Qwen]]></category>
		<category><![CDATA[UVLM]]></category>
		<category><![CDATA[VLM]]></category>
		<guid isPermaLink="false">https://urbangeoanalytics.com/?p=2356</guid>

					<description><![CDATA[<p>UVLM is a free, open-source tool for loading, testing, and comparing Vision-Language Models on custom image analysis tasks. Running entirely in Google Colab, it lets researchers and practitioners benchmark multiple AI models using the same prompts and images — no coding, no GPU ownership, no model-specific pipelines. This post explains what VLMs are, why comparing them matters, and how to get started in five minutes.</p>
<p>The post <a href="https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/">Introducing UVLM: A Free Tool to Compare AI Models That Understand Images</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="fusion-fullwidth fullwidth-box fusion-builder-row-3 fusion-flex-container has-pattern-background has-mask-background nonhundred-percent-fullwidth non-hundred-percent-height-scrolling" style="--awb-border-radius-top-left:0px;--awb-border-radius-top-right:0px;--awb-border-radius-bottom-right:0px;--awb-border-radius-bottom-left:0px;--awb-flex-wrap:wrap;" id="contenu" ><div class="fusion-builder-row fusion-row fusion-flex-align-items-flex-start fusion-flex-content-wrap" style="max-width:1248px;margin-left: calc(-4% / 2 );margin-right: calc(-4% / 2 );"><div class="fusion-layout-column fusion_builder_column fusion-builder-column-4 fusion_builder_column_3_4 3_4 fusion-flex-column" style="--awb-bg-size:cover;--awb-width-large:75%;--awb-margin-top-large:0px;--awb-spacing-right-large:2.56%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:2.56%;--awb-width-medium:75%;--awb-order-medium:0;--awb-spacing-right-medium:2.56%;--awb-spacing-left-medium:2.56%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;" id="contenu" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-9" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" 
fusion-imageframe imageframe-none imageframe-9 hover-type-none"><img decoding="async" width="1536" height="595" title="uvlm" src="https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm.png" alt class="img-responsive wp-image-2342" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm-200x77.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm-400x155.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm-600x232.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm-800x310.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm-1200x465.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/uvlm.png 1536w" sizes="(max-width: 640px) 100vw, 1200px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title">uvlm</div></div></div></div><div class="fusion-text fusion-text-35"><h5><strong>Highlights</strong></h5>
</div><div class="fusion-text fusion-text-36" style="--awb-margin-top:-30px;"><ul>
<li><strong>New open-source release: UVLM v2.2.2</strong> — compare Vision-Language Models from a single notebook</li>
<li><strong>11 AI models</strong>, 5 analysis tasks, 120 test images — all benchmarked with one tool</li>
<li><strong>No coding, no installation</strong> — runs in Google Colab with a free account</li>
</ul>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-text fusion-text-37 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>Imagine you have thousands of street photographs and you need to answer the same questions about each one: how many cars are parked? Is there a sidewalk? How long is the building frontage? Hiring someone to go through every image manually would take weeks. Training a custom computer vision model would take months. But what if you could simply ask an AI model these questions in plain English — and get structured, usable answers back?</p>
<p>That is exactly what Vision-Language Models do. And today, we are releasing UVLM — an open-source tool that makes it easy to load, test, and compare these models, all from a single notebook in your browser.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-19 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">What Are Vision-Language Models?</h2></div><div class="fusion-text fusion-text-38 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>Vision-Language Models (VLMs) are AI systems that can look at an image and answer questions about it in natural language. Unlike traditional computer vision, which requires training a separate model for every task (one for counting cars, another for detecting sidewalks, a third for classifying buildings), a VLM handles all of these through text prompts. You write a question, attach a photo, and the model responds.</p>
<p>For example, you can ask a VLM: “Count all motor vehicles visible in this image” and it will answer “3”. You can ask the same model “Is there a sidewalk along the street frontage?” and it will answer “yes”. You can even ask it to estimate the length of a building facade in meters — a task that requires the model to identify reference objects (like parked cars), estimate their size, and reason about perspective. All of this from a single model, with no retraining and no labelled dataset.</p>
<p>The catch is that there are many VLM families available (LLaVA, Qwen, InternVL, BLIP-2, and more), and each one works differently under the hood. They use different image encoders, different tokenisation strategies, and different code to run. If you want to know which model is best for your specific task, you normally have to write separate code for each one — a tedious and error-prone process.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-20 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">This Is the Problem UVLM Solves</h2></div><div class="fusion-text fusion-text-39 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>UVLM (Universal Vision-Language Model Loader) is a free, open-source tool that lets you load, configure, and compare multiple VLM architectures using the same prompts and the same evaluation protocol — without writing any model-specific code. It runs entirely in Google Colab, which means you do not need to install anything on your computer or own a GPU. A free Google account is all you need.</p>
<p>The idea is simple: you pick a model from a dropdown menu, type your analysis questions into a form, point the tool at a folder of images, and hit run. UVLM handles all the technical details — the processor classes, the tokenisation, the generation settings, the output parsing — and delivers a clean CSV file with one row per image and one column per task. If you want to try a different model, you just switch the dropdown and run again. Same prompts, same images, same output format. Now you can compare.</p>
</div><div class="fusion-image-element awb-imageframe-style awb-imageframe-style-below awb-imageframe-style-10" style="text-align:center;--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--body_typography-font-family);--awb-caption-title-font-weight:var(--body_typography-font-weight);--awb-caption-title-font-style:var(--body_typography-font-style);--awb-caption-title-size:var(--body_typography-font-size);--awb-caption-title-transform:var(--body_typography-text-transform);--awb-caption-title-line-height:var(--body_typography-line-height);--awb-caption-title-letter-spacing:var(--body_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-10 hover-type-none"><img decoding="async" width="1190" height="823" title="image1" src="https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1.png" alt class="img-responsive wp-image-2319" srcset="https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1-200x138.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1-400x277.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1-600x415.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1-800x553.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2026/03/image1.png 1190w" sizes="(max-width: 640px) 100vw, 1190px" /></span><div class="awb-imageframe-caption-container" style="text-align:center;"><div class="awb-imageframe-caption"><div class="awb-imageframe-caption-title">The three-block structure of the UVLM Loader</div></div></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-21 
fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">A Practical Example: Scoring 120 Street Photographs</h2></div><div class="fusion-text fusion-text-40 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>To demonstrate what UVLM can do, we benchmarked 8 different models on 120 street-level photographs of French urban frontages. Each image was analysed on five tasks: counting vehicles, detecting sidewalks, counting pedestrian entrances, estimating the street frontage length in meters, and classifying the vegetation type. That is 16 model configurations (each model tested in standard and advanced reasoning modes), 120 images, and 5 tasks per image — all processed and compared through UVLM.</p>
<p>The results were revealing. The largest model (LLaVA 34B, with 34 billion parameters) actually ranked last overall. A much smaller model (LLaVA Vicuna 7B) outperformed it significantly and ran on a free Google Colab GPU. The best overall results came from Qwen 32B with chain-of-thought reasoning enabled, which achieved 88% proximity to human expert annotations across all five tasks. Without UVLM, discovering these differences would have required writing and debugging eight separate inference pipelines.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-22 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Who Is UVLM For?</h2></div><div class="fusion-text fusion-text-41 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>UVLM was designed for anyone who works with images and wants to extract structured information from them at scale — without becoming a machine learning engineer. If you are an urban planner evaluating streetscape quality across a city, UVLM lets you score thousands of street photographs using natural language prompts. If you are an environmental researcher classifying vegetation from field photographs, UVLM lets you test which AI model gives the most reliable results for your specific classification scheme. If you are an infrastructure inspector processing damage assessment photographs, UVLM lets you set up automated counting and scoring tasks and run them across your entire image archive.</p>
<p>The tool is also valuable for AI researchers who need a controlled benchmarking environment. Because UVLM ensures that every model receives exactly the same prompt and is evaluated with the same metrics, it produces fair, reproducible comparisons. The consensus validation feature (running each task multiple times and taking a majority vote) addresses the inherent randomness of AI outputs, and the truncation detection feature flags when a model’s response was cut off before it could finish — a common but often invisible source of errors.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-23 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">How to Get Started</h2></div><div class="fusion-text fusion-text-42 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>Getting started takes about five minutes. Open the UVLM notebook from GitHub (the link is below), connect to a GPU runtime in Google Colab, and run the first block to load a model. The second block gives you a form where you type your analysis questions — no coding required. The third block processes your images and saves the results as a CSV file on your Google Drive.</p>
<p>The tool currently supports 11 model checkpoints from two major families (LLaVA-NeXT and Qwen2.5-VL), ranging from 3 billion to 110 billion parameters. Models up to 34B can run on a single free-tier Colab GPU with 4-bit quantisation. Advanced features include consensus validation (2–5 runs per task with majority voting), chain-of-thought reasoning for complex tasks, and automatic truncation detection.</p>
<p>UVLM is released under the Apache 2.0 open-source licence. You can use it, modify it, and build on it for any purpose — academic or commercial.</p>
</div><div class="fusion-text fusion-text-43 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2>Links</h2>
<p><strong>Source code: </strong><a class="keychainify-checked" href="https://github.com/perezjoan/UVLM">github.com/perezjoan/UVLM</a></p>
<p><strong>Paper: </strong><a class="keychainify-checked" href="https://arxiv.org/abs/2603.13893">arXiv preprint — Perez &amp; Fusco (2026)</a></p>
<p><strong>UVLM page on this site: </strong><a class="keychainify-checked" href="https://urbangeoanalytics.com/algorithms-softwares/uvlm-universal-vision-language-model-loader/">urbangeoanalytics.com › Softwares &amp; Algorithms › UVLM</a></p>
<p><strong>Benchmark dataset: </strong><a class="keychainify-checked" href="https://zenodo.org/records/18959690">Zenodo — 120 street-view images</a></p>
<h2>Citation</h2>
<p>If you use UVLM in your work, please cite:</p>
<p><em>Perez, J. &amp; Fusco, G. (2026). UVLM: A Universal Vision-Language Model Loader for Reproducible Multimodal Benchmarking. arXiv:2603.13893</em></p>
</div></div></div><div class="fusion-layout-column fusion_builder_column fusion-builder-column-5 awb-sticky awb-sticky-medium awb-sticky-large fusion_builder_column_1_4 1_4 fusion-flex-column" style="--awb-padding-top:20px;--awb-padding-right:20px;--awb-padding-bottom:20px;--awb-padding-left:20px;--awb-bg-size:cover;--awb-border-color:var(--awb-color6);--awb-border-style:solid;--awb-width-large:25%;--awb-margin-top-large:0px;--awb-spacing-right-large:7.68%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:7.68%;--awb-width-medium:25%;--awb-order-medium:0;--awb-spacing-right-medium:7.68%;--awb-spacing-left-medium:7.68%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;--awb-sticky-offset:150px;" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-text fusion-text-44"><p><span style="color: #143c4e;"><strong>Table of contents</strong></span></p>
</div><div class="awb-toc-el awb-toc-el--3" data-awb-toc-id="3" data-awb-toc-options="{&quot;allowed_heading_tags&quot;:{&quot;h2&quot;:0},&quot;ignore_headings&quot;:&quot;&quot;,&quot;ignore_headings_words&quot;:&quot;&quot;,&quot;enable_cache&quot;:&quot;no&quot;,&quot;highlight_current_heading&quot;:&quot;yes&quot;,&quot;hide_hidden_titles&quot;:&quot;no&quot;,&quot;limit_container&quot;:&quot;page_content&quot;,&quot;select_custom_headings&quot;:&quot;.contenu H2, .contenu H3&quot;,&quot;icon&quot;:&quot;fa-flag fas&quot;,&quot;counter_type&quot;:&quot;none&quot;}" style="--awb-item-padding-right:5px;--awb-item-padding-left:5px;"><div class="awb-toc-el__content"></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:20px;margin-bottom:20px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-image-element " style="--awb-margin-top:25px;--awb-margin-bottom:25px;--awb-caption-title-font-family:var(--h2_typography-font-family);--awb-caption-title-font-weight:var(--h2_typography-font-weight);--awb-caption-title-font-style:var(--h2_typography-font-style);--awb-caption-title-size:var(--h2_typography-font-size);--awb-caption-title-transform:var(--h2_typography-text-transform);--awb-caption-title-line-height:var(--h2_typography-line-height);--awb-caption-title-letter-spacing:var(--h2_typography-letter-spacing);--awb-filter:saturate(100%);--awb-filter-transition:filter 0.3s ease;--awb-filter-hover:saturate(0%);"><span class=" fusion-imageframe imageframe-none imageframe-11 hover-type-zoomout"><img decoding="async" width="1536" height="1024" title="blog lvl2" src="https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15.png" alt class="img-responsive wp-image-1687" 
srcset="https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15-200x133.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15-400x267.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15-600x400.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15-800x533.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15-1200x800.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2025/11/ChatGPT-Image-7-nov.-2025-09_10_15.png 1536w" sizes="(max-width: 640px) 100vw, 400px" /></span></div></div></div></div></div>
<p>The post <a href="https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/">Introducing UVLM: A Free Tool to Compare AI Models That Understand Images</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://urbangeoanalytics.com/introducing-uvlm-free-tool-compare-ai-vision-language-models/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>A Stable and Reproducible Vision–Language Inference Engine for SAGAI v1.1</title>
		<link>https://urbangeoanalytics.com/a-stable-and-reproducible-vision-language-inference-engine-for-sagai-v1-1/</link>
					<comments>https://urbangeoanalytics.com/a-stable-and-reproducible-vision-language-inference-engine-for-sagai-v1-1/#respond</comments>
		
		<dc:creator><![CDATA[Joan Perez]]></dc:creator>
		<pubDate>Wed, 17 Dec 2025 17:03:56 +0000</pubDate>
				<category><![CDATA[Python]]></category>
		<category><![CDATA[Urbanism]]></category>
		<category><![CDATA[Vision Language Model]]></category>
		<category><![CDATA[AI]]></category>
		<category><![CDATA[Cloud Computing]]></category>
		<category><![CDATA[GIS]]></category>
		<category><![CDATA[Llava]]></category>
		<category><![CDATA[Spatial Analysis]]></category>
		<guid isPermaLink="false">https://urbangeoanalytics.com/?p=2275</guid>

					<description><![CDATA[<p>SAGAI v1.1 introduces Module 3 v2.0, a stable and reproducible vision–language inference engine for streetscape analysis. Built exclusively on Hugging Face LLaVA models, it enables robust multimodal processing of street-level images for large-scale urban and geospatial analysis.</p>
<p>The post <a href="https://urbangeoanalytics.com/a-stable-and-reproducible-vision-language-inference-engine-for-sagai-v1-1/">A Stable and Reproducible Vision–Language Inference Engine for SAGAI v1.1</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="fusion-fullwidth fullwidth-box fusion-builder-row-4 fusion-flex-container has-pattern-background has-mask-background nonhundred-percent-fullwidth non-hundred-percent-height-scrolling" style="--awb-border-radius-top-left:0px;--awb-border-radius-top-right:0px;--awb-border-radius-bottom-right:0px;--awb-border-radius-bottom-left:0px;--awb-flex-wrap:wrap;" id="contenu" ><div class="fusion-builder-row fusion-row fusion-flex-align-items-flex-start fusion-flex-content-wrap" style="max-width:1248px;margin-left: calc(-4% / 2 );margin-right: calc(-4% / 2 );"><div class="fusion-layout-column fusion_builder_column fusion-builder-column-6 fusion_builder_column_3_4 3_4 fusion-flex-column" style="--awb-bg-size:cover;--awb-width-large:75%;--awb-margin-top-large:0px;--awb-spacing-right-large:2.56%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:2.56%;--awb-width-medium:75%;--awb-order-medium:0;--awb-spacing-right-medium:2.56%;--awb-spacing-left-medium:2.56%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;" id="contenu" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-image-element " style="--awb-caption-title-font-family:var(--h2_typography-font-family);--awb-caption-title-font-weight:var(--h2_typography-font-weight);--awb-caption-title-font-style:var(--h2_typography-font-style);--awb-caption-title-size:var(--h2_typography-font-size);--awb-caption-title-transform:var(--h2_typography-text-transform);--awb-caption-title-line-height:var(--h2_typography-line-height);--awb-caption-title-letter-spacing:var(--h2_typography-letter-spacing);"><span class=" fusion-imageframe imageframe-none imageframe-12 hover-type-none"><img decoding="async" width="1536" height="1024" title="Sagai 1.1" 
src="https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1.png" alt class="img-responsive wp-image-2278" srcset="https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1-200x133.png 200w, https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1-400x267.png 400w, https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1-600x400.png 600w, https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1-800x533.png 800w, https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1-1200x800.png 1200w, https://urbangeoanalytics.com/wp-content/uploads/2025/12/Sagai-1.1.png 1536w" sizes="(max-width: 640px) 100vw, 1200px" /></span></div><div class="fusion-text fusion-text-45"><h5><strong>Highlights</strong></h5>
</div><div class="fusion-text fusion-text-46" style="--awb-margin-top:-30px;"><ul>
<li><strong data-start="142" data-end="159">Module 3 v2.0</strong> is the refactored inference engine of <strong data-start="198" data-end="212" data-is-only-node="">SAGAI v1.1</strong>, designed for stable and reproducible vision–language analysis of streetscape images.</li>
<li>The new architecture relies <strong data-start="329" data-end="389">exclusively on Hugging Face–native LLaVA models and APIs</strong>, removing dependencies on research codebases.</li>
<li>Multimodal prompting, image–text alignment, and inference are handled through <strong data-start="516" data-end="555">standardized Transformers workflows</strong>, ensuring long-term compatibility.</li>
</ul>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-24 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">Introduction</h2></div><div class="fusion-text fusion-text-47 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p data-start="438" data-end="784">Module 3 is the inference core of the <strong data-start="476" data-end="548">SAGAI (Streetscape Analysis with Generative Artificial Intelligence)</strong> framework. Its role is to transform large collections of street-level images into <strong data-start="631" data-end="667">structured, quantitative outputs</strong> using vision–language models (VLMs), enabling systematic streetscape analysis and subsequent geospatial aggregation.</p>
<p data-start="786" data-end="1114">With <strong data-start="791" data-end="805">SAGAI v1.1</strong>, Module 3 has been released in a new major version (<strong data-start="858" data-end="875">Module 3 v2.0</strong>) that introduces a fully standardized and maintenance-safe inference architecture. This update reflects both the maturation of multimodal model ecosystems and the need for long-term reproducibility in large-scale urban analysis pipelines.</p>
<p data-start="1116" data-end="1480">Earlier iterations of Module 3 were developed during a period of rapid evolution in both LLaVA research codebases and execution environments such as Google Colab. As multimodal models transitioned toward <strong data-start="1320" data-end="1388">Transformers-native implementations distributed via Hugging Face</strong>, assumptions embedded in earlier hybrid workflows became increasingly difficult to sustain.</p>
<p data-start="1482" data-end="1811">Module 3 v2.0 addresses this evolution by aligning the entire inference pipeline with <strong data-start="1568" data-end="1609">official Hugging Face multimodal APIs</strong>. Model loading, prompt formatting, image–text fusion, and generation are now handled through maintained and versioned components, ensuring compatibility across environments, models, and future updates.</p>
<p data-start="1813" data-end="2040">This document details the architectural context motivating the update, the design choices behind the refactored inference engine, and the rationale for releasing Module 3 v2.0 as a long-term, stable component of <strong data-start="2025" data-end="2039">SAGAI v1.1</strong>.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-25 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">1. Architectural Context of Module 3 in the Previous Version: SAGAI v1.0</h2></div><div class="fusion-text fusion-text-48 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p>The initial implementation of Module 3 (SAGAI v1.0) relied on a <strong data-start="280" data-end="353">hybrid architecture that mixed two incompatible sources of LLaVA code</strong>, combined with a rapidly evolving execution environment in Google Colab. This design choice made the pipeline fragile and ultimately unsustainable.</p>
<p data-start="503" data-end="1003">First, the pipeline simultaneously depended on the <strong data-start="554" data-end="581">LLaVA GitHub repository</strong> (<code data-start="583" data-end="602"><span style="font-size: 10.0pt;">haotian-liu/LLaVA</span></code>) and on <strong data-start="611" data-end="652">Hugging Face–hosted model checkpoints</strong>. The GitHub repository is a research-oriented codebase under active development. Its internal APIs, class structures, and utilities evolve rapidly and are not version-locked. Constructors, module paths, and helper functions may change or disappear without notice, and the repository is not designed to maintain backward compatibility across releases.</p>
<p data-start="1005" data-end="1528">At the same time, pretrained model weights were downloaded from Hugging Face. These checkpoints follow the <strong data-start="1112" data-end="1153">Transformers-native multimodal format</strong>, using Hugging Face–specific configuration files, processors, and model classes (e.g., <code data-start="1241" data-end="1276"><span style="font-size: 10.0pt;">LlavaNextForConditionalGeneration</span></code>, <code data-start="1278" data-end="1293"><span style="font-size: 10.0pt;">AutoProcessor</span></code>, and chat templates). This architecture is fundamentally different from the internal design assumed by the GitHub LLaVA code, which relies on custom token insertion, internal vision tower management, and non-Transformers abstractions.</p>
<p data-start="1530" data-end="1846">As a result, the pipeline operated in a <strong data-start="1570" data-end="1593">structural mismatch</strong>: GitHub code expected architectural fields, model attributes, and tokenizer behavior that were not present in Hugging Face checkpoints, while Hugging Face checkpoints expected model wrappers and configuration logic that the GitHub code did not provide.</p>
<p data-start="1848" data-end="2245">This fragility was exposed when <strong data-start="1880" data-end="1929">Google Colab upgraded its backend environment</strong> in early 2025. Major changes included Python 3.12, NumPy ≥ 2.0 (introducing ABI-breaking changes for compiled extensions), newer PyTorch releases (≥ 2.2), and updated system libraries. These updates caused widespread failures in binary dependencies and research codebases that were not aligned with the new runtime.</p>
<p data-start="2247" data-end="2577">In practice, this led to errors such as NumPy ABI incompatibilities, PyTorch extension failures, missing or renamed modules, and import errors in LLaVA GitHub utilities. Because the pipeline depended on both unstable research code and binary-sensitive extensions, even minor environment updates were sufficient to break execution.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-26 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">2. Refactoring of the Inference Engine in SAGAI v1.1</h2></div><div class="fusion-text fusion-text-49 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p style="text-align: justify;">Module 3 has been fully refactored to <strong data-start="341" data-end="406">remove any dependency on the original LLaVA GitHub repository</strong>. The inference pipeline now relies exclusively on <strong data-start="457" data-end="502">Hugging Face–native LLaVA models and APIs</strong>, ensuring long-term stability and compatibility with evolving software environments.</p>
<p style="text-align: justify;" data-start="589" data-end="1175">In the previous architecture, the script depended on cloning the LLaVA GitHub repository, installing it in editable mode, and importing internal modules (<code data-start="743" data-end="752"><span style="font-size: 10.0pt;">llava.*</span></code>). Prompts were manually assembled using LLaVA-specific multimodal tokens (e.g., <code data-start="833" data-end="845"><span style="font-size: 10.0pt;">&lt;im_start&gt;</span></code>, <code data-start="847" data-end="856"><span style="font-size: 10.0pt;">&lt;image&gt;</span></code>), custom separators, and internal utilities. Image tokens and embeddings were explicitly inserted into the prompt, tightly coupling the forward pass to a specific implementation of the LLaVA codebase. As a result, updates to Google Colab, PyTorch, NumPy, or the LLaVA repository frequently introduced breaking changes.</p>
<p style="text-align: justify;" data-start="1177" data-end="1752">The current implementation removes all such dependencies. Prompt formatting and multimodal input construction are now handled entirely through Hugging Face abstractions. Prompts are formatted using <code data-start="1375" data-end="1408"><span style="font-size: 10.0pt;">processor.apply_chat_template()</span></code>, while images and text are combined using <code data-start="1451" data-end="1480"><span style="font-size: 10.0pt;">processor(images=…, text=…)</span></code>. Image embedding alignment, multimodal token placement, and chat formatting are fully managed by the Hugging Face processor and model configuration. Inference is performed using the standard <code data-start="1672" data-end="1690"><span style="font-size: 10.0pt;">model.generate()</span></code> API, without any custom token handling or internal utilities.</p>
<p style="text-align: justify;" data-start="1754" data-end="2177">This refactoring makes the SAGAI inference engine <strong data-start="1804" data-end="1862">model-agnostic within the Hugging Face LLaVA ecosystem</strong>. The same forward pass is compatible with LLaVA-NeXT (v1.6), LLaVA-Interleave, LLaVA-OneVision, and future Hugging Face LLaVA releases that expose a processor and chat template. Switching between models or architectures requires only changing the <code data-start="2110" data-end="2120"><span style="font-size: 10.0pt;">model_id</span></code>, with no modification to prompt logic or inference code.</p>
<p style="text-align: justify;" data-start="2179" data-end="2639">To ensure reliable downstream analysis, Module 3 also includes a dedicated <strong data-start="2254" data-end="2291">numeric output stabilization step</strong>. After decoding the model response, any prompt echoes or metadata—including residual <code data-start="2377" data-end="2395"><span style="font-size: 10.0pt;">[INST] … [/INST]</span></code> segments—are removed. The final output is parsed using a simple regular expression to retain only numeric values (e.g., <code data-start="2516" data-end="2519"><span style="font-size: 10.0pt;">0</span></code>, <code data-start="2521" data-end="2524"><span style="font-size: 10.0pt;">1</span></code>, <code data-start="2526" data-end="2529"><span style="font-size: 10.0pt;">2</span></code>, <code data-start="2531" data-end="2536"><span style="font-size: 10.0pt;">1.5</span></code>). This guarantees clean, machine-readable outputs and a stable CSV format across all supported models.</p>
<p style="text-align: justify;" data-start="2641" data-end="3230">Model loading has been simplified and standardized using Hugging Face–approved APIs. Both the processor and the model are instantiated directly from Hugging Face model cards via <code data-start="2819" data-end="2836"><span style="font-size: 10.0pt;">from_pretrained</span></code>, with optional 4-bit quantization enabled through <code data-start="2887" data-end="2906"><span style="font-size: 10.0pt;">load_in_4bit=True</span></code>. This eliminates the need for manual vision-tower initialization, deprecated classes, or custom C++ operators, and avoids common incompatibilities related to PyTorch, CUDA, or NumPy upgrades in Google Colab. Official Hugging Face code paths ensure that pretrained weights are always matched with the correct implementation.</p>
<p style="text-align: justify;" data-start="3232" data-end="3456">Optional authentication using a Hugging Face access token is supported to avoid rate limits and improve download reliability when working with large checkpoints, though public models remain accessible without authentication.</p>
<p style="text-align: justify;" data-start="3458" data-end="3697">Overall, this refactoring significantly improves <strong data-start="3507" data-end="3559">robustness, reproducibility, and maintainability</strong>, while enabling systematic experimentation across multiple LLaVA variants and quantization settings within a unified inference framework.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-27 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">3. Rationale for a Long-Term, Stable Release</h2></div><div class="fusion-text fusion-text-50 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><p style="text-align: justify;">The refactored inference system in Module 3 is designed as a <strong data-start="332" data-end="371">long-term, maintenance-safe release</strong>. This is achieved by aligning the entire pipeline with Hugging Face’s officially supported multimodal APIs and model distribution mechanisms.</p>
<p style="text-align: justify;" data-start="560" data-end="1128">First, the new architecture is <strong data-start="591" data-end="637">robust to Google Colab environment updates</strong>. All critical dependencies—Python (≥3.12), NumPy (≥2.0), PyTorch (2.x), CUDA wheels, and BitsAndBytes quantization—are now managed through Hugging Face Transformers and its dependency resolution. Because the model code, processor logic, and quantization pathways are maintained upstream, updates to Colab or its underlying libraries no longer break the inference pipeline. As long as Hugging Face continues to support the model card, the code remains functional without manual intervention.</p>
<p style="text-align: justify;" data-start="1130" data-end="1617">Second, the system relies exclusively on <strong data-start="1171" data-end="1218">official Hugging Face–maintained components</strong>. Core classes such as <code data-start="1241" data-end="1276"><span style="font-size: 10.0pt;">LlavaNextForConditionalGeneration</span></code>, <code data-start="1278" data-end="1298"><span style="font-size: 10.0pt;">LlavaNextProcessor</span></code>, chat templates, and multimodal preprocessing logic are all part of the Transformers library. These components are actively maintained, versioned, and tested by Hugging Face, providing a level of stability and backward compatibility that is not guaranteed when relying on research repositories or development branches.</p>
<p style="text-align: justify;" data-start="1619" data-end="2162">Third, the new setup significantly improves <strong data-start="1663" data-end="1682">reproducibility</strong>. Each run explicitly references a fixed Hugging Face model checkpoint via the <code data-start="1761" data-end="1771"><span style="font-size: 10.0pt;">model_id</span></code>, ensuring that the same weights, architecture, and prompt template are used across sessions and machines. In addition, generation parameters (sampling strategy, temperature, nucleus sampling, and output length) are explicitly defined, enabling consistent and repeatable results across runs.</p>
<p style="text-align: justify;" data-start="2164" data-end="2626">Fourth, the architecture is <strong data-start="2192" data-end="2230">easy to extend and experiment with</strong>. Switching between different LLaVA variants now requires changing a single configuration line (<code data-start="2326" data-end="2336"><span style="font-size: 10.0pt;">model_id</span></code>). The same inference code supports LLaVA 1.5 models, LLaVA-NeXT (v1.6), Interleave models, OneVision models, and larger checkpoints (e.g., 13B or 34B), including variants based on Mistral, Vicuna, Qwen, or Yi backbones. No changes to prompt construction or forward-pass logic are required.</p>
<p style="text-align: justify;" data-start="2628" data-end="3091">Finally, the multimodal pipeline is now <strong data-start="2668" data-end="2716">cleanly abstracted and internally consistent</strong>. Hugging Face handles all low-level details, including image preprocessing, chat formatting, positional embeddings, image sequence length management, and attention masking. This eliminates a large class of subtle bugs related to tensor alignment and multimodal token placement, while ensuring that the vision and language components remain synchronized across model updates.</p>
</div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:25px;margin-bottom:25px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div><div class="fusion-title title fusion-title-28 fusion-sep-none fusion-title-text fusion-title-size-two" style="--awb-margin-top:25px;--awb-margin-bottom:25px;"><h2 class="fusion-title-heading title-heading-left fusion-responsive-typography-calculated" style="margin:0;--fontSize:48;line-height:var(--awb-typography1-line-height);">4. References and links</h2></div><div class="fusion-text fusion-text-51 fusion-text-no-margin" style="--awb-content-alignment:justify;--awb-margin-top:25px;--awb-margin-bottom:25px;"><ul>
<li style="text-align: justify;">
<p class="heading-element" dir="auto" tabindex="-1">Streetscape Analysis with Generative AI (SAGAI) on GitHub, with the v1.1 update. <a class="keychainify-checked" href="https://github.com/perezjoan/SAGAI">https://github.com/perezjoan/SAGAI</a></p>
</li>
<li>Perez, J. and Fusco, G. (2025) <em>Streetscape Analysis with Generative AI (SAGAI): Vision-Language Assessment and Mapping of Urban Scenes</em>. Geomatica, 77(2), 100063, 18p. Available at: <a class="keychainify-checked" href="https://www.sciencedirect.com/science/article/pii/S1195103625000199" rel="nofollow">https://www.sciencedirect.com/science/article/pii/S1195103625000199</a></li>
</ul>
</div></div></div><div class="fusion-layout-column fusion_builder_column fusion-builder-column-7 awb-sticky awb-sticky-medium awb-sticky-large fusion_builder_column_1_4 1_4 fusion-flex-column" style="--awb-padding-top:20px;--awb-padding-right:20px;--awb-padding-bottom:20px;--awb-padding-left:20px;--awb-bg-size:cover;--awb-border-color:var(--awb-color6);--awb-border-style:solid;--awb-width-large:25%;--awb-margin-top-large:0px;--awb-spacing-right-large:7.68%;--awb-margin-bottom-large:20px;--awb-spacing-left-large:7.68%;--awb-width-medium:25%;--awb-order-medium:0;--awb-spacing-right-medium:7.68%;--awb-spacing-left-medium:7.68%;--awb-width-small:100%;--awb-order-small:0;--awb-spacing-right-small:1.92%;--awb-spacing-left-small:1.92%;--awb-sticky-offset:150px;" data-scroll-devices="small-visibility,medium-visibility,large-visibility"><div class="fusion-column-wrapper fusion-column-has-shadow fusion-flex-justify-content-flex-start fusion-content-layout-column"><div class="fusion-text fusion-text-52"><p><span style="color: #143c4e;"><strong>Table of contents</strong></span></p>
</div><div class="awb-toc-el awb-toc-el--4" data-awb-toc-id="4" data-awb-toc-options="{&quot;allowed_heading_tags&quot;:{&quot;h2&quot;:0},&quot;ignore_headings&quot;:&quot;&quot;,&quot;ignore_headings_words&quot;:&quot;&quot;,&quot;enable_cache&quot;:&quot;no&quot;,&quot;highlight_current_heading&quot;:&quot;yes&quot;,&quot;hide_hidden_titles&quot;:&quot;no&quot;,&quot;limit_container&quot;:&quot;page_content&quot;,&quot;select_custom_headings&quot;:&quot;.contenu H2, .contenu H3&quot;,&quot;icon&quot;:&quot;fa-flag fas&quot;,&quot;counter_type&quot;:&quot;none&quot;}" style="--awb-item-padding-right:5px;--awb-item-padding-left:5px;"><div class="awb-toc-el__content"></div></div><div class="fusion-separator fusion-full-width-sep" style="align-self: center;margin-left: auto;margin-right: auto;margin-top:20px;margin-bottom:20px;width:100%;"><div class="fusion-separator-border sep-single sep-solid" style="--awb-height:20px;--awb-amount:20px;--awb-sep-color:var(--awb-color6);border-color:var(--awb-color6);border-top-width:1px;"></div></div></div></div></div></div>
<p>The post <a href="https://urbangeoanalytics.com/a-stable-and-reproducible-vision-language-inference-engine-for-sagai-v1-1/">A Stable and Reproducible Vision–Language Inference Engine for SAGAI v1.1</a> appeared first on <a href="https://urbangeoanalytics.com">Urban Geo Analytics</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://urbangeoanalytics.com/a-stable-and-reproducible-vision-language-inference-engine-for-sagai-v1-1/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
	</channel>
</rss>
