# 1. Environment Setup
!sudo apt-get update
!sudo apt-get install -y libvips-dev
!pip install pyvips git+https://github.com/openai/CLIP.git
!pip install transformers torch torchvision accelerate sentencepiece pillow pandas numpy matplotlib seaborn scikit-learn scipy pyro-ppl

# Unaccusativity Syntax or Picture Difficulty?
## Full Computational Analysis Pipeline
This notebook replicates the analysis of visual stimuli using CLIP and Qwen-VL, including Bayesian regressions for similarity, salience, and ordinal verification scores.
import os
import torch
import clip
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoTokenizer
# Set up plotting style
sns.set_style("whitegrid")
# plt.rcParams['figure.figsize'] = (10, 6)
if torch.cuda.is_available():
device = "cuda"
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "cpu"  # Apple Silicon detected, but fall back to CPU (CLIP can be unreliable on MPS)
else:
device = "cpu"
model_clip, preprocess = clip.load("ViT-B/32", device=device, jit=False)
print(f"Using device: {device}")
print(f"CLIP model loaded successfully!")
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import transformers
import torch
from transformers.generation.beam_search import BeamSearchScorer

# Re-expose BeamSearchScorer at the package top level (expected by Qwen-VL-Chat's remote code)
transformers.BeamSearchScorer = BeamSearchScorer
# Load Qwen-VL-Chat model
model_id = "Qwen/Qwen-VL-Chat"
model_vlm = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32
).to('cpu')
tokenizer_vlm = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Create the streamer
streamer = TextStreamer(tokenizer_vlm, skip_prompt=True)
# 2. Data & Model Loading
If you do not have the image files locally, this notebook will attempt to load cached_scores.csv if available.
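Before anything is computed, a quick availability check can save a confusing failure later. A minimal sketch; it only assumes the ./pictures directory and cached_scores.csv paths already used in this notebook:
import os

# Sketch: warn early if neither the stimulus images nor the cached scores are present.
_have_images = os.path.isdir("./pictures")
_have_cache = os.path.exists("./cached_scores.csv")
if not (_have_images or _have_cache):
    print("Warning: neither ./pictures/ nor ./cached_scores.csv was found; "
          "the scoring cells below will fail until one of them is provided.")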
# Unergative scenes
df_unerg = pd.DataFrame({
"Filename": [
"./pictures/octopus_swim.jpg",
"./pictures/ballerina_run.jpg",
"./pictures/boy_float.jpg",
"./pictures/chef_yell.jpg",
"./pictures/clown_walk.jpg",
"./pictures/cowboy_wink.jpg",
"./pictures/dog_bark.jpg",
"./pictures/monkey_sleep.jpg",
"./pictures/penguin_sneeze.jpg",
"./pictures/pirate_cough.jpg",
"./pictures/rabbit_smile.jpg",
"./pictures/snail_crawl.jpg",
],
"Sentence": [
"The octopus is swimming.",
"The ballerina is running.",
"The boy is floating.",
"The chef is yelling.",
"The clown is walking.",
"The cowboy is winking.",
"The dog is barking.",
"The monkey is sleeping.",
"The penguin is sneezing.",
"The pirate is coughing.",
"The rabbit is smiling.",
"The snail is crawling.",
]
})
# Unaccusative scenes
df_unacc = pd.DataFrame({
"Filename": [
"./pictures/octopus_boil.jpg",
"./pictures/ballerina_shrink.jpg",
"./pictures/boy_yawn.jpg",
"./pictures/chef_drown.jpg",
"./pictures/clown_grow.jpg",
"./pictures/cowboy_fall.jpg",
"./pictures/dog_spin.jpg",
"./pictures/monkey_trip.jpg",
"./pictures/penguin_bounce.jpg",
"./pictures/pirate_sink.jpg",
"./pictures/rabbit_shake.jpg",
"./pictures/snail_melt.jpg",
],
"Sentence": [
"The octopus is boiling.",
"The ballerina is shrinking.",
"The boy is yawning.",
"The chef is drowning.",
"The clown is growing.",
"The cowboy is falling.",
"The dog is spinning.",
"The monkey is tripping.",
"The penguin is bouncing.",
"The pirate is sinking.",
"The rabbit is shaking.",
"The snail is melting.",
]
})

# 3. Bayesian Regressions (Normal & Ordinal Logistic)
We model the effect of Verb Type across three metrics [cite: 270-272]:
1. Full Scene (CLIP): linear regression
2. Subject Salience (CLIP): linear regression
3. Scene Verification (VLM): ordered logistic regression on the 1-10 ordinal match score
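Concretely, with VerbType coded numerically as $x_i \in \{-0.5, +0.5\}$ (unergative vs. unaccusative, as in the Pyro code below), the three models are:

$$
\begin{aligned}
\text{CLIP}_i &\sim \mathcal{N}(\alpha + \beta\, x_i,\ \sigma) \\
\text{Salience}_i &\sim \mathcal{N}(\alpha + \beta\, x_i,\ \sigma) \\
P(\text{VLM}_i \le k) &= \operatorname{logit}^{-1}\!\big(c_k - (\alpha + \beta\, x_i)\big), \qquad c_1 < c_2 < \dots < c_{K-1},
\end{aligned}
$$

with weakly informative $\mathcal{N}(0, 10)$ priors on $\alpha$ and $\beta$, a HalfNormal(10) prior on $\sigma$, and ordered cutpoints $c_k$. A positive $\beta$ corresponds to higher scores for unaccusative items.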
def compute_clip_similarity(df, model, preprocess, device):
"""
Compute CLIP similarity scores for image-text pairs.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : CLIP model
Loaded CLIP model
preprocess : function
CLIP preprocessing function
device : str
'cuda' or 'cpu'
Returns:
--------
pandas.DataFrame
Original dataframe with added 'CLIP_Similarity' column
"""
similarity_scores = []
for _, row in df.iterrows():
img_path = row['Filename']
text = row['Sentence']
# Preprocess image and tokenize text
img = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
text_tokenized = clip.tokenize([text]).to(device)
# Compute similarity
with torch.no_grad():
logits_per_image, _ = model(img, text_tokenized)
similarity_score = logits_per_image.item()
similarity_scores.append(similarity_score)
# Add scores to dataframe
df_copy = df.copy()
df_copy['CLIP_Similarity'] = similarity_scores
return df_copy
def compute_subject_salience(df, model, preprocess, device):
"""
Compute CLIP similarity scores for subject noun alone.
This measures how visually salient/easy to identify the subject is.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : CLIP model
Loaded CLIP model
preprocess : function
CLIP preprocessing function
device : str
'cuda' or 'cpu'
Returns:
--------
pandas.DataFrame
Original dataframe with added 'Subject_Salience' column
"""
subject_scores = []
for _, row in df.iterrows():
img_path = row['Filename']
sentence = row['Sentence']
# Extract subject noun (assumes format "The X is ...")
# Extract word after "The " and before " is"
subject = sentence.split("The ")[1].split(" is")[0]
# Preprocess image and tokenize subject
img = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
text_tokenized = clip.tokenize([subject]).to(device)
# Compute similarity
with torch.no_grad():
logits_per_image, _ = model(img, text_tokenized)
similarity_score = logits_per_image.item()
subject_scores.append(similarity_score)
df_copy = df.copy()
df_copy['Subject_Salience'] = subject_scores
return df_copy
def compute_qwen_scores(df, model, tokenizer, streamer=None):
"""
Compute verification scores using Qwen-VL-Chat multimodal LLM.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : Qwen-VL-Chat model
Loaded Qwen model
tokenizer : AutoTokenizer
Qwen tokenizer
streamer : TextStreamer, optional
Streamer for real-time output
Returns:
--------
pandas.DataFrame
Original dataframe with added 'VLM_Score' and 'VLM_Response' columns
"""
import re
scores = []
responses = []
for idx, row in df.iterrows():
img_path = row['Filename']
sentence = row['Sentence']
# Create query for Qwen-VL-Chat
query = tokenizer.from_list_format([
{'image': img_path},
{'text': f'Rate how well this sentence describes the image: "{sentence}"\nScore from 1-10 (1=mismatch, 10=perfect match). Reply with just the number.'},
])
# Generate response
with torch.no_grad():
response, _ = model.chat(tokenizer, query=query, history=None, streamer=streamer)
# Extract numeric score
try:
match = re.search(r'(\d+(?:\.\d+)?)', response)
score = float(match.group(1)) if match else 5.0
score = min(10.0, max(1.0, score)) # Clamp to 1-10
        except Exception:
            score = 5.0
scores.append(score)
responses.append(response)
df_copy = df.copy()
df_copy['VLM_Score'] = scores
df_copy['VLM_Response'] = responses
    return df_copy

import os
CACHE_FILE = "./cached_scores.csv"
if os.path.exists(CACHE_FILE):
df_all = pd.read_csv(CACHE_FILE)
else:
# Compute CLIP similarities
df_unerg_clip = compute_clip_similarity(df_unerg, model_clip, preprocess, device)
df_unacc_clip = compute_clip_similarity(df_unacc, model_clip, preprocess, device)
# Compute subject salience scores
df_unerg_subj = compute_subject_salience(df_unerg, model_clip, preprocess, device)
df_unacc_subj = compute_subject_salience(df_unacc, model_clip, preprocess, device)
# Compute Qwen-VL scores
df_unerg_vlm = compute_qwen_scores(df_unerg, model_vlm, tokenizer_vlm, streamer=streamer)
df_unacc_vlm = compute_qwen_scores(df_unacc, model_vlm, tokenizer_vlm, streamer=streamer)
# Combine CLIP scores with VLM scores and subject salience
df_unerg_scored = df_unerg_clip.copy()
df_unerg_scored['Subject_Salience'] = df_unerg_subj['Subject_Salience']
df_unerg_scored['VLM_Score'] = df_unerg_vlm['VLM_Score']
df_unerg_scored['VLM_Response'] = df_unerg_vlm['VLM_Response']
df_unerg_scored['VerbType'] = 'Unergative'
df_unacc_scored = df_unacc_clip.copy()
df_unacc_scored['Subject_Salience'] = df_unacc_subj['Subject_Salience']
df_unacc_scored['VLM_Score'] = df_unacc_vlm['VLM_Score']
df_unacc_scored['VLM_Response'] = df_unacc_vlm['VLM_Response']
df_unacc_scored['VerbType'] = 'Unaccusative'
# Combine for analysis
df_all = pd.concat([df_unerg_scored, df_unacc_scored], ignore_index=True)
# Save to cache
df_all.to_csv(CACHE_FILE, index=False)
print(df_all.head())

# Create comparison plot with all three metrics
fig, axes = plt.subplots(1, 3, figsize=(8, 3))
# CLIP full sentence results
sns.pointplot(data=df_all, x='VerbType', y='CLIP_Similarity',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[0], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='CLIP_Similarity',
color='black', alpha=0.5, size=8, ax=axes[0], jitter=0.2)
axes[0].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[0].set_ylabel('CLIP Similarity Score', fontsize=14, fontweight='bold')
axes[0].set_title('Full Sentence Similarity',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['CLIP_Similarity'].mean()
axes[0].text(0 if verb_type == 'Unergative' else 1, mean_val + 1,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
# Subject salience results
sns.pointplot(data=df_all, x='VerbType', y='Subject_Salience',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[1], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='Subject_Salience',
color='black', alpha=0.5, size=8, ax=axes[1], jitter=0.2)
axes[1].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Subject Salience Score', fontsize=14, fontweight='bold')
axes[1].set_title('Subject Noun Identifiability',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['Subject_Salience'].mean()
axes[1].text(0 if verb_type == 'Unergative' else 1, mean_val + 0.5,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
# VLM results
sns.pointplot(data=df_all, x='VerbType', y='VLM_Score',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[2], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='VLM_Score',
color='black', alpha=0.5, size=8, ax=axes[2], jitter=0.2)
axes[2].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Qwen-VL Match Score (1-10)', fontsize=14, fontweight='bold')
axes[2].set_title('Scene Verification (Qwen-VL)',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['VLM_Score'].mean()
axes[2].text(0 if verb_type == 'Unergative' else 1, mean_val + 0.3,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('./model_comparison_plot.png', dpi=300, bbox_inches='tight')
plt.show()

import torch
import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS
# Prepare data for Pyro
# We'll center the scores and code VerbType numerically
df_pyro = df_all.copy()
df_pyro['VerbType_num'] = df_pyro['VerbType'].map({'Unergative': -0.5, 'Unaccusative': 0.5})
df_pyro['CLIP_centered'] = df_pyro['CLIP_Similarity'] - df_pyro['CLIP_Similarity'].mean()
df_pyro['Subject_centered'] = df_pyro['Subject_Salience'] - df_pyro['Subject_Salience'].mean()
# VLM scores (integers 1-10) are treated directly as ordinal categories for the ordered-logistic model
vlm_score_tensor = torch.tensor(df_pyro['VLM_Score'].values, dtype=torch.long)
# Convert to tensors
verb_type_tensor = torch.tensor(df_pyro['VerbType_num'].values, dtype=torch.float32)
clip_tensor = torch.tensor(df_pyro['CLIP_centered'].values, dtype=torch.float32)
subject_tensor = torch.tensor(df_pyro['Subject_centered'].values, dtype=torch.float32)
# --- Model for CLIP Similarity ---
def clip_model(verb_type, obs=None):
intercept = pyro.sample('intercept', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
sigma = pyro.sample('sigma', dist.HalfNormal(10.))
mu = intercept + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.Normal(mu, sigma), obs=obs)
# --- Model for Subject Salience ---
def subject_model(verb_type, obs=None):
intercept = pyro.sample('intercept', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
sigma = pyro.sample('sigma', dist.HalfNormal(10.))
mu = intercept + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.Normal(mu, sigma), obs=obs)
# --- Model for VLM Score (Ordered Logistic) ---
k_categories = vlm_score_tensor.max().item() + 1
k_cutpoints = k_categories - 1
def vlm_model(verb_type, obs=None):
alpha = pyro.sample('alpha', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
with pyro.plate("cutpoints_plate", k_cutpoints):
raw_cutpoints = pyro.sample('raw_cutpoints', dist.Normal(torch.arange(k_cutpoints).float(), 1.))
cutpoints = torch.sort(raw_cutpoints)[0]
latent_propensity = alpha + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.OrderedLogistic(latent_propensity, cutpoints), obs=obs)
# Run the MCMC samplers
mcmc_clip = MCMC(NUTS(clip_model), num_samples=2000, warmup_steps=1000)
mcmc_clip.run(verb_type_tensor, clip_tensor)
clip_samples = mcmc_clip.get_samples()
mcmc_subject = MCMC(NUTS(subject_model), num_samples=2000, warmup_steps=1000)
mcmc_subject.run(verb_type_tensor, subject_tensor)
subject_samples = mcmc_subject.get_samples()
mcmc_vlm = MCMC(NUTS(vlm_model), num_samples=2000, warmup_steps=1000, num_chains=1)
mcmc_vlm.run(verb_type_tensor, vlm_score_tensor)
vlm_samples = mcmc_vlm.get_samples()

# Get posterior samples and print results
clip_beta_mean = clip_samples['beta'].mean().item()
clip_beta_hdi = torch.quantile(clip_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nCLIP Similarity - Bayesian Regression:")
print(f" Beta (VerbType effect): {clip_beta_mean:.3f}")
print(f" 95% HDI: [{clip_beta_hdi[0]:.3f}, {clip_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(clip_samples['beta'] < 0).float().mean():.3f}")
subject_beta_mean = subject_samples['beta'].mean().item()
subject_beta_hdi = torch.quantile(subject_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nSubject Salience - Bayesian Regression:")
print(f" Beta (VerbType effect): {subject_beta_mean:.3f}")
print(f" 95% HDI: [{subject_beta_hdi[0]:.3f}, {subject_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(subject_samples['beta'] < 0).float().mean():.3f}")
vlm_beta_mean = vlm_samples['beta'].mean().item()
vlm_beta_hdi = torch.quantile(vlm_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nVLM Score - Ordered Logistic Regression:")
print(f" Beta (VerbType effect): {vlm_beta_mean:.3f}")
print(f" 95% HDI: [{vlm_beta_hdi[0]:.3f}, {vlm_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(vlm_samples['beta'] < 0).float().mean():.3f}"import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Data dictionary from your MCMC samples
beta_data = {
'Full Scene (CLIP)': clip_samples['beta'].numpy(),
'Subject Salience (CLIP)': subject_samples['beta'].numpy(),
'Scene Verification (VLM)': vlm_samples['beta'].numpy()
}
# Adjust figure size for better vertical separation
fig, ax = plt.subplots(figsize=(8, 4))
sns.set_style("whitegrid", {'axes.grid': True, 'grid.color': '.95'})
labels = list(beta_data.keys())
colors = ['#3498db', '#9b59b6', '#e74c3c']
for i, label in enumerate(labels):
samples = beta_data[label]
mean_val = samples.mean()
# 1. Calculate multiple intervals for the "stacking" effect
hdi_95 = np.percentile(samples, [2.5, 97.5])
hdi_80 = np.percentile(samples, [10, 90])
hdi_50 = np.percentile(samples, [25, 75])
# 2. Plot the stacked lines (Bottom to Top: thinnest/widest first)
# 95% Interval - Thin
ax.hlines(i, hdi_95[0], hdi_95[1], color=colors[i], linewidth=1.5, alpha=0.4, zorder=1)
# 80% Interval - Medium
ax.hlines(i, hdi_80[0], hdi_80[1], color=colors[i], linewidth=5.0, alpha=0.7, zorder=2)
# 50% Interval - Thick
ax.hlines(i, hdi_50[0], hdi_50[1], color=colors[i], linewidth=10.0, alpha=1.0, zorder=3)
# 3. Plot the Mean point
ax.plot(mean_val, i, 'o', color='white', markersize=8, zorder=4)
# 4. Perfectly Aligned Statistics
p_dir = (samples < 0).mean() if mean_val < 0 else (samples > 0).mean()
prob_text = f"$P(\\beta {'<' if mean_val < 0 else '>' } 0) = {p_dir:.2f}$"
# Locked to y-coordinate 'i' and x-coordinate 3.0 (outside plot area)
ax.text(3.0, i, prob_text, va='center', ha='left',
fontsize=13, fontweight='bold', color=colors[i])
# 5. Descriptive Annotations (The "How to Read" Guide)
ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, alpha=0.6, zorder=0)
# Arrow pointing Left (Negative Beta)
ax.annotate('', xy=(-5, -1.0), xytext=(-0.5, -1.0),
arrowprops=dict(arrowstyle="->", color='gray', lw=1.5))
ax.text(-2.75, -1.4, "Lower Scores for\nUnaccusatives", ha='center', color='gray', fontweight='bold')
# Arrow pointing Right (Positive Beta)
ax.annotate('', xy=(2.5, -1.0), xytext=(0.5, -1.0),
arrowprops=dict(arrowstyle="->", color='gray', lw=1.5))
ax.text(1.5, -1.4, "Lower Scores for\nUnergatives", ha='center', color='gray', fontweight='bold')
# 6. Final Layout Polish
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels, fontweight='bold', fontsize=12)
ax.set_xlabel('Posterior Beta Weight (Unaccusative vs. Unergative)', fontsize=13, labelpad=45)
# Lock limits so text and arrows don't shift
ax.set_xlim(-6, 3)
ax.set_ylim(-1.5, len(labels) - 0.5)
sns.despine(left=True, bottom=True)
plt.subplots_adjust(right=0.75, bottom=0.2) # Make room for text on right and guide on bottom
plt.savefig('./model_pyro.png', dpi=300, bbox_inches='tight')
plt.show()