Vision-Language Models - GeoAI with Python

Introduction¶

Learning Objectives¶

How Vision-Language Models Work¶

Setting Up the Environment¶

# %pip install geoai-py transformers==4.57.6

import geoai
import leafmap
from geoai import MoondreamGeo

Sample Data¶

# Download the sample raster used throughout the rest of the walkthrough.
# The value returned by `geoai.download_file` is kept in `image_path`, which
# every later call receives as its image argument.
url = "https://data.source.coop/opengeos/geoai/parking-lot.tif"
image_path = geoai.download_file(url)

# Create a leafmap map and add the downloaded raster as a named layer.
m = leafmap.Map()
m.add_raster(image_path, layer_name="Satellite Image")
m  # bare expression: in a notebook this displays the map as the cell output

Initializing the Moondream Processor¶

# Build the Moondream wrapper; `processor` is the object the captioning,
# question-answering, detection and sliding-window calls below are made on.
# The model is requested with an explicit revision string.
processor = MoondreamGeo(
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)

Image Captioning¶

# Request one caption per `length` setting and print each as it arrives.
# `result` is rebound on every pass, so after the loop it holds the response
# for the final ("long") setting.
for caption_length in ("short", "normal", "long"):
    result = processor.caption(image_path, length=caption_length)
    print(result["caption"])

Visual Question Answering¶

# Put each free-form question to the model and print the answer it returns.
# `result` is rebound on every pass, so after the loop it holds the response
# to the last question.
for question in (
    "How many buildings are in the image?",
    "What are the building roof colors?",
    "What types of vehicles are visible in the parking areas?",
):
    result = processor.query(question, image_path)
    print(result["answer"])

Object Detection and Point Localization¶

Detect Buildings¶

# Run detection for the text prompt "building" and pass a GeoJSON output path.
# The response is indexed below by "objects" (a sized collection, one entry
# per detection) and "gdf" (the object handed to the map).
result = processor.detect(image_path, "building", output_path="buildings.geojson")
print(f"Detected {len(result['objects'])} buildings")

# Bare expression: in a notebook this displays the "gdf" entry.
result["gdf"]

# Add the detections to the map created earlier, with a red 2-unit outline.
style = {"color": "red", "weight": 2}
m.add_gdf(result["gdf"], layer_name="Buildings", style=style)
m

Locate Building Centroids¶

# Run point localization for the prompt "building". Note that `result` is
# rebound here, replacing the earlier detection response; this response is
# indexed by "points" rather than "objects".
result = processor.point(
    image_path, "building", output_path="building_centroids.geojson"
)
print(f"Found {len(result['points'])} building centroids")

# Add the points as another layer on the same map and display it again.
m.add_gdf(result["gdf"], layer_name="Building Centroids")
m

Detect Trees¶

# Same detection call as for buildings, now with the prompt "tree" and its
# own GeoJSON output path.
result = processor.detect(image_path, "tree", output_path="trees.geojson")
print(f"Detected {len(result['objects'])} trees")

# Add the tree detections in green. The map is not re-displayed here; `m` is
# next shown after the tree-centroid layer is added.
m.add_gdf(result["gdf"], layer_name="Trees", style={"color": "green", "weight": 2})

Locate Tree Centroids¶

# Point localization for the prompt "tree"; `result` is rebound again.
result = processor.point(image_path, "tree", output_path="tree_centroids.geojson")
print(f"Found {len(result['points'])} tree centroids")

# Add the points with default styling and display the accumulated map.
m.add_gdf(result["gdf"], layer_name="Tree Centroids")
m

Interactive GUI¶

# A separate MoondreamGeo instance is created for the GUI session, using the
# same model name and revision that `processor` was built with.
# NOTE(review): this looks like it loads the model a second time; confirm
# whether `processor` could be reused here instead.
moondream = MoondreamGeo(
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)
# The image is loaded onto the instance before the GUI is opened.
moondream.load_image(image_path)
m_gui = moondream.show_gui()
m_gui

# Read back the most recent GUI result from the object `show_gui` returned.
# NOTE(review): accessed as an attribute, not called; presumably a
# GeoDataFrame given the name. Verify against the geoai API.
gdf = m_gui.last_result_as_gdf
gdf

Sliding Window Analysis for Large Rasters¶

Object Detection with Sliding Window¶

# Sliding-window detection for the prompt "car".
# window_size / overlap are passed as 512 and 64 (presumably pixels; confirm
# against the geoai docs); iou_threshold=0.5 is presumably used to merge
# duplicate boxes where windows overlap. TODO confirm.
result = processor.detect_sliding_window(
    image_path,
    "car",
    window_size=512,
    overlap=64,
    iou_threshold=0.5,
    output_path="cars_sliding_window.geojson",
)
print(f"Detected {len(result['objects'])} cars")

# Preview the "gdf" entry of the response.
result["gdf"].head()

# Plot on a fresh map (`m2`) rather than on `m`, which already carries the
# building and tree layers.
m2 = leafmap.Map()
m2.add_raster(image_path, layer_name="Satellite Image")
m2.add_gdf(
    result["gdf"],
    layer_name="Detected Cars",
    style={"color": "red", "fillOpacity": 0.3},
)
m2

Point Detection with Sliding Window¶

# Sliding-window point localization for the prompt "tree". The response is
# stored in `trees`, so `result` from the previous step is left untouched.
trees = processor.point_sliding_window(
    image_path,
    "tree",
    window_size=512,
    overlap=64,
    output_path="trees_sliding_window.geojson",
)
print(f"Found {len(trees['points'])} tree locations")

# Plot the points on their own map (`m3`) over the source raster.
m3 = leafmap.Map()
m3.add_raster(image_path, layer_name="Satellite Image")
m3.add_gdf(trees["gdf"], layer_name="Trees", style={"color": "green", "radius": 3})
m3

Visual Question Answering with Sliding Window¶

# Ask two questions through the sliding-window interface, each with its own
# combine strategy, and print the combined answer for each.
# `result` is rebound on every pass, so after the loop it holds the response
# to the second question (the "summarize" run).
for question, strategy in (
    ("What types of vehicles are visible?", "concatenate"),
    ("Describe the land use and features in this area.", "summarize"),
):
    result = processor.query_sliding_window(
        question,
        image_path,
        window_size=512,
        overlap=64,
        combine_strategy=strategy,
    )
    print(result["answer"])

# Print the per-tile answers of that last response, limited to the first two.
for tile in result["tile_answers"][:2]:
    print(f"Tile {tile['tile_id']}: {tile['answer']}\n")

Image Captioning with Sliding Window¶

# Caption the raster through the sliding-window interface twice: once with
# length "normal" and strategy "concatenate", once with length "long" and
# strategy "summarize". Each combined caption is printed as it is produced,
# and `result` ends up holding the second response.
for caption_length, strategy in (
    ("normal", "concatenate"),
    ("long", "summarize"),
):
    result = processor.caption_sliding_window(
        image_path,
        window_size=512,
        overlap=64,
        length=caption_length,
        combine_strategy=strategy,
    )
    print(result["caption"])

# Release cached memory through geoai's helper before moving on.
geoai.empty_cache()

Convenience Functions¶

# Module-level function form of sliding-window detection: the model name and
# revision are passed directly to the call instead of going through a
# MoondreamGeo instance created beforehand.
from geoai import moondream_detect_sliding_window

result = moondream_detect_sliding_window(
    image_path,
    "car",
    window_size=512,
    overlap=64,
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)
print(f"Detected {len(result['objects'])} cars")

# View the detections with the source raster passed as the `tiles` argument.
geoai.view_vector_interactive(result["gdf"], tiles=image_path)

Comparing Regular vs. Sliding Window Detection¶

# `processor` is rebound to a newly constructed instance (same model name and
# revision as before) for the comparison.
processor = MoondreamGeo(
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)

# Single-pass detection on the image for the prompt "car".
regular_result = processor.detect(image_path, "car")
print(f"Regular detection: {len(regular_result['objects'])} cars")

# Same prompt through the sliding-window method. No iou_threshold or
# output_path is passed here, so whatever defaults the method defines apply.
sliding_result = processor.detect_sliding_window(
    image_path, "car", window_size=512, overlap=64
)
print(f"Sliding window detection: {len(sliding_result['objects'])} cars")

# Release cached memory through geoai's helper.
geoai.empty_cache()

Performance Tips¶

CLIP-Based Segmentation¶

# This section works on a different raster from the Moondream examples, so
# it keeps its own URL and path variables.
clip_image_url = "https://data.source.coop/opengeos/geoai/uc-berkeley.tif"
clip_image_path = geoai.download_file(clip_image_url)

# Preview the input raster.
geoai.view_raster(clip_image_path)

# Configure the segmenter with a tile size of 512 and an overlap of 32
# (presumably pixels; confirm against the geoai docs).
segmenter = geoai.CLIPSegmentation(tile_size=512, overlap=32)

mask_output_path = "tree_masks.tif"
text_prompt = "trees"

# Segment the raster for the text prompt; the output path for the mask is
# given explicitly. threshold and smoothing_sigma are passed as 0.5 and 1.0.
segmenter.segment_image(
    clip_image_path,
    output_path=mask_output_path,
    text_prompt=text_prompt,
    threshold=0.5,
    smoothing_sigma=1.0,
)

# View the mask with the source raster supplied as `basemap`; nodata is set
# to 0 for the mask layer.
geoai.view_raster(
    mask_output_path,
    nodata=0,
    opacity=0.7,
    colormap="greens",
    layer_name="Trees",
    basemap=clip_image_path,
)

# Split view: mask on the left, source raster on the right. The mask's
# display options are passed through `left_args`.
geoai.create_split_map(
    left_layer=mask_output_path,
    right_layer=clip_image_path,
    left_label="Trees",
    right_label="Satellite Image",
    left_args={"nodata": 0, "opacity": 0.8, "colormap": "greens"},
    basemap=clip_image_path,
)

Introduction¶

Learning Objectives¶

How Vision-Language Models Work¶

Setting Up the Environment¶

Sample Data¶

Initializing the Moondream Processor¶

Image Captioning¶

Visual Question Answering¶

Object Detection and Point Localization¶

Detect Buildings¶

Locate Building Centroids¶

Detect Trees¶

Locate Tree Centroids¶

Interactive GUI¶

Sliding Window Analysis for Large Rasters¶

Object Detection with Sliding Window¶

Point Detection with Sliding Window¶

Visual Question Answering with Sliding Window¶

Image Captioning with Sliding Window¶

Convenience Functions¶

Comparing Regular vs. Sliding Window Detection¶

Performance Tips¶

CLIP-Based Segmentation¶

Practical Applications in Earth Observation¶

Limitations and Considerations¶

Key Takeaways¶

Exercises¶

Exercise 1: Caption Length Comparison¶

Exercise 2: Geospatial Visual Question Answering¶

Exercise 3: Object Detection and Counting¶

Exercise 4: Multi-Class CLIP Segmentation¶