Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Vision-Language Models

Introduction

Learning Objectives

How Vision-Language Models Work

Setting Up the Environment

# %pip install geoai-py transformers==4.57.6
import geoai
import leafmap
from geoai import MoondreamGeo

Sample Data

# Remote location of the sample raster (a .tif file) used by the code below.
url = "https://data.source.coop/opengeos/geoai/parking-lot.tif"
# Fetch the file; the return value is used from here on as the raster path.
image_path = geoai.download_file(url)
# Create a leafmap map and add the raster to it as a named layer.
m = leafmap.Map()
m.add_raster(image_path, layer_name="Satellite Image")
# Bare expression: as the last statement of a notebook cell this displays the map.
m

Initializing the Moondream Processor

# Build the MoondreamGeo object on which the caption, query, detect and point
# calls further down are made.
processor = MoondreamGeo(
    model_name="vikhyatk/moondream2",
    # Explicit revision rather than an implicit default; presumably a model-hub
    # revision tag that pins the weights (confirm against the geoai docs).
    revision="2025-06-21",
)

Image Captioning

# Caption the same image once per length setting and print each caption.
# `result` is left holding the response for the last ("long") setting.
for caption_length in ("short", "normal", "long"):
    result = processor.caption(image_path, length=caption_length)
    print(result["caption"])

Visual Question Answering

# Free-form questions put to the model about the sample image, asked in order.
questions = (
    "How many buildings are in the image?",
    "What are the building roof colors?",
    "What types of vehicles are visible in the parking areas?",
)
# Each response is a mapping; the text of the reply is under "answer".
for question in questions:
    result = processor.query(question, image_path)
    print(result["answer"])

Object Detection and Point Localization

Detect Buildings

# Run detection with the text prompt "building", passing a GeoJSON output path.
result = processor.detect(image_path, "building", output_path="buildings.geojson")
# result["objects"] holds one entry per detection; report how many came back.
print(f"Detected {len(result['objects'])} buildings")
# Bare expression: only rendered when it is the last statement of a notebook
# cell (it was presumably its own cell in the original notebook).
result["gdf"]
# Styling passed to add_gdf for the detection layer.
style = {"color": "red", "weight": 2}
m.add_gdf(result["gdf"], layer_name="Buildings", style=style)
# Show the map again, now with the "Buildings" layer added.
m

Locate Building Centroids

# Request point locations for the "building" prompt, with a GeoJSON output path.
result = processor.point(
    image_path,
    "building",
    output_path="building_centroids.geojson",
)
centroid_count = len(result["points"])
print(f"Found {centroid_count} building centroids")

# Overlay the returned GeoDataFrame on the existing map and display it.
centroid_gdf = result["gdf"]
m.add_gdf(centroid_gdf, layer_name="Building Centroids")
m

Detect Trees

# Run detection with the "tree" prompt, passing a GeoJSON output path.
result = processor.detect(image_path, "tree", output_path="trees.geojson")
tree_count = len(result["objects"])
print(f"Detected {tree_count} trees")

# Add the detections to the existing map as a green layer.
tree_style = {"color": "green", "weight": 2}
m.add_gdf(result["gdf"], layer_name="Trees", style=tree_style)

Locate Tree Centroids

# Request point locations for the "tree" prompt, passing a GeoJSON output path.
result = processor.point(image_path, "tree", output_path="tree_centroids.geojson")
# result["points"] holds one entry per located point.
print(f"Found {len(result['points'])} tree centroids")
# Add the points to the existing map (no explicit style is passed here).
m.add_gdf(result["gdf"], layer_name="Tree Centroids")
# Bare expression: as the last statement of a notebook cell this displays the map.
m

Interactive GUI

# A separate MoondreamGeo instance, built with the same model name and revision
# as `processor`.
# NOTE(review): whether this loads the model weights a second time is not
# visible here — confirm before running on a memory-constrained GPU.
moondream = MoondreamGeo(
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)
# Attach the sample raster to this instance before opening the GUI.
moondream.load_image(image_path)
# show_gui() returns an object that is displayed here and read from below.
m_gui = moondream.show_gui()
m_gui
# Read the GUI's latest result; presumably a GeoDataFrame, going by the
# attribute name — verify against the geoai API.
gdf = m_gui.last_result_as_gdf
gdf

Sliding Window Analysis for Large Rasters

Object Detection with Sliding Window

# Keyword settings for the sliding-window detection call, gathered in one place.
sliding_options = {
    "window_size": 512,
    "overlap": 64,
    "iou_threshold": 0.5,
    "output_path": "cars_sliding_window.geojson",
}
result = processor.detect_sliding_window(image_path, "car", **sliding_options)
car_count = len(result["objects"])
print(f"Detected {car_count} cars")
result["gdf"].head()

# Separate map showing only the raster and the car detections.
m2 = leafmap.Map()
m2.add_raster(image_path, layer_name="Satellite Image")
car_style = {"color": "red", "fillOpacity": 0.3}
m2.add_gdf(result["gdf"], layer_name="Detected Cars", style=car_style)
m2

Point Detection with Sliding Window

# Sliding-window variant of the point call for the "tree" prompt; the result is
# kept in `trees` so it does not overwrite `result`.
trees = processor.point_sliding_window(
    image_path,
    "tree",
    window_size=512,
    overlap=64,
    output_path="trees_sliding_window.geojson",
)
# trees["points"] holds one entry per located point.
print(f"Found {len(trees['points'])} tree locations")
# Separate map showing only the raster and the tree points.
m3 = leafmap.Map()
m3.add_raster(image_path, layer_name="Satellite Image")
# NOTE(review): "radius" is presumably a point-marker size honoured by add_gdf —
# confirm against the leafmap docs.
m3.add_gdf(trees["gdf"], layer_name="Trees", style={"color": "green", "radius": 3})
m3

Visual Question Answering with Sliding Window

# (question, combine_strategy) pairs, run in this order. `result` ends up
# holding the response for the last pair.
sliding_queries = (
    ("What types of vehicles are visible?", "concatenate"),
    ("Describe the land use and features in this area.", "summarize"),
)
for question, strategy in sliding_queries:
    result = processor.query_sliding_window(
        question,
        image_path,
        window_size=512,
        overlap=64,
        combine_strategy=strategy,
    )
    print(result["answer"])

# Per-tile answers from the last query above; only the first two are printed.
first_tiles = result["tile_answers"][:2]
for tile in first_tiles:
    print(f"Tile {tile['tile_id']}: {tile['answer']}\n")

Image Captioning with Sliding Window

# (length, combine_strategy) pairs for the two sliding-window captioning runs.
caption_runs = (
    ("normal", "concatenate"),
    ("long", "summarize"),
)
for caption_length, strategy in caption_runs:
    result = processor.caption_sliding_window(
        image_path,
        window_size=512,
        overlap=64,
        length=caption_length,
        combine_strategy=strategy,
    )
    print(result["caption"])

# Called once, after both runs have finished.
geoai.empty_cache()

Convenience Functions

from geoai import moondream_detect_sliding_window

# Module-level function form of sliding-window detection: the model name and
# revision are passed directly instead of going through a MoondreamGeo object.
result = moondream_detect_sliding_window(
    image_path,
    "car",
    window_size=512,
    overlap=64,
    model_name="vikhyatk/moondream2",
    revision="2025-06-21",
)
# result["objects"] holds one entry per detection.
print(f"Detected {len(result['objects'])} cars")
# View the detections interactively, with the raster path passed as `tiles`.
geoai.view_vector_interactive(result["gdf"], tiles=image_path)

Comparing Regular vs. Sliding Window Detection

# Rebind `processor` to a newly constructed MoondreamGeo for the comparison.
processor = MoondreamGeo(model_name="vikhyatk/moondream2", revision="2025-06-21")

# Single call over the whole image.
regular_result = processor.detect(image_path, "car")
regular_count = len(regular_result["objects"])
print(f"Regular detection: {regular_count} cars")

# Same prompt, run through the sliding-window variant.
sliding_result = processor.detect_sliding_window(
    image_path,
    "car",
    window_size=512,
    overlap=64,
)
sliding_count = len(sliding_result["objects"])
print(f"Sliding window detection: {sliding_count} cars")

geoai.empty_cache()

Performance Tips

CLIP-Based Segmentation

# Download a second sample raster for the CLIP-based segmentation example and
# preview it.
clip_image_url = "https://data.source.coop/opengeos/geoai/uc-berkeley.tif"
clip_image_path = geoai.download_file(clip_image_url)
geoai.view_raster(clip_image_path)
# Segmenter configured with a tile size and an overlap between tiles
# (units presumably pixels — confirm against the geoai docs).
segmenter = geoai.CLIPSegmentation(tile_size=512, overlap=32)
# Output path for the mask and the text prompt that selects what to segment.
mask_output_path = "tree_masks.tif"
text_prompt = "trees"
segmenter.segment_image(
    clip_image_path,
    output_path=mask_output_path,
    text_prompt=text_prompt,
    threshold=0.5,
    smoothing_sigma=1.0,
)
# Show the mask over the source raster: the mask path is the layer and the
# original image path is passed as the basemap.
geoai.view_raster(
    mask_output_path,
    nodata=0,
    opacity=0.7,
    colormap="greens",
    layer_name="Trees",
    basemap=clip_image_path,
)
# Split view with the mask on the left and the source image on the right.
# left_args carries the display options for the left (mask) layer.
geoai.create_split_map(
    left_layer=mask_output_path,
    right_layer=clip_image_path,
    left_label="Trees",
    right_label="Satellite Image",
    left_args={"nodata": 0, "opacity": 0.8, "colormap": "greens"},
    basemap=clip_image_path,
)

Practical Applications in Earth Observation

Limitations and Considerations

Key Takeaways

Exercises

Exercise 1: Caption Length Comparison

Exercise 2: Geospatial Visual Question Answering

Exercise 3: Object Detection and Counting

Exercise 4: Multi-Class CLIP Segmentation