# 1. Environment Setup
!sudo apt-get update
!sudo apt-get install -y libvips-dev
!pip install pyvips git+https://github.com/openai/CLIP.git
!pip install transformers torch torchvision accelerate sentencepiece pillow pandas numpy matplotlib seaborn scikit-learn scipy pyro-ppl

# Unaccusativity Syntax or Picture Difficulty?
## Full Computational Analysis Pipeline
This notebook replicates the analysis of visual stimuli using CLIP and Qwen-VL, including Bayesian regressions for similarity, salience, and ordinal verification scores.
import os
import torch
import clip
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoTokenizer
# Set up plotting style
sns.set_style("whitegrid")
# plt.rcParams['figure.figsize'] = (10, 6)
if torch.cuda.is_available():
device = "cuda"
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "cpu"  # Apple Silicon detected, but fall back to CPU (CLIP can be unreliable on MPS)
else:
device = "cpu"
model_clip, preprocess = clip.load("ViT-B/32", device=device, jit=False)
print(f"Using device: {device}")
print(f"CLIP model loaded successfully!")
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import transformers
import torch
from transformers.generation.beam_search import BeamSearchScorer

# Re-expose BeamSearchScorer at the package top level (expected by Qwen-VL-Chat's remote code)
transformers.BeamSearchScorer = BeamSearchScorer
# Load Qwen-VL-Chat model
model_id = "Qwen/Qwen-VL-Chat"
model_vlm = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32
).to('cpu')
tokenizer_vlm = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Create the streamer
streamer = TextStreamer(tokenizer_vlm, skip_prompt=True)
# 2. Data & Model Loading
If you do not have the image files locally, this notebook will attempt to load cached_scores.csv if available.
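Before anything is computed, a quick availability check can save a confusing failure later. A minimal sketch; it only assumes the ./pictures directory and cached_scores.csv paths already used in this notebook:
import os

# Sketch: warn early if neither the stimulus images nor the cached scores are present.
_have_images = os.path.isdir("./pictures")
_have_cache = os.path.exists("./cached_scores.csv")
if not (_have_images or _have_cache):
    print("Warning: neither ./pictures/ nor ./cached_scores.csv was found; "
          "the scoring cells below will fail until one of them is provided.")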
# Unergative scenes
df_unerg = pd.DataFrame({
"Filename": [
"./pictures/octopus_swim.jpg",
"./pictures/ballerina_run.jpg",
"./pictures/boy_float.jpg",
"./pictures/chef_yell.jpg",
"./pictures/clown_walk.jpg",
"./pictures/cowboy_wink.jpg",
"./pictures/dog_bark.jpg",
"./pictures/monkey_sleep.jpg",
"./pictures/penguin_sneeze.jpg",
"./pictures/pirate_cough.jpg",
"./pictures/rabbit_smile.jpg",
"./pictures/snail_crawl.jpg",
],
"Sentence": [
"The octopus is swimming.",
"The ballerina is running.",
"The boy is floating.",
"The chef is yelling.",
"The clown is walking.",
"The cowboy is winking.",
"The dog is barking.",
"The monkey is sleeping.",
"The penguin is sneezing.",
"The pirate is coughing.",
"The rabbit is smiling.",
"The snail is crawling.",
]
})
# Unaccusative scenes
df_unacc = pd.DataFrame({
"Filename": [
"./pictures/octopus_boil.jpg",
"./pictures/ballerina_shrink.jpg",
"./pictures/boy_yawn.jpg",
"./pictures/chef_drown.jpg",
"./pictures/clown_grow.jpg",
"./pictures/cowboy_fall.jpg",
"./pictures/dog_spin.jpg",
"./pictures/monkey_trip.jpg",
"./pictures/penguin_bounce.jpg",
"./pictures/pirate_sink.jpg",
"./pictures/rabbit_shake.jpg",
"./pictures/snail_melt.jpg",
],
"Sentence": [
"The octopus is boiling.",
"The ballerina is shrinking.",
"The boy is yawning.",
"The chef is drowning.",
"The clown is growing.",
"The cowboy is falling.",
"The dog is spinning.",
"The monkey is tripping.",
"The penguin is bouncing.",
"The pirate is sinking.",
"The rabbit is shaking.",
"The snail is melting.",
]
})

# 3. Bayesian Regressions (Normal & Ordinal Logistic)
We model the effect of Verb Type across three metrics [cite: 270-272]:
1. Full Scene (CLIP): linear regression
2. Subject Salience (CLIP): linear regression
3. Scene Verification (VLM): ordered logistic regression on the 1-10 ordinal match score
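Concretely, with VerbType coded numerically as $x_i \in \{-0.5, +0.5\}$ (unergative vs. unaccusative, as in the Pyro code below), the three models are:

$$
\begin{aligned}
\text{CLIP}_i &\sim \mathcal{N}(\alpha + \beta\, x_i,\ \sigma) \\
\text{Salience}_i &\sim \mathcal{N}(\alpha + \beta\, x_i,\ \sigma) \\
P(\text{VLM}_i \le k) &= \operatorname{logit}^{-1}\!\big(c_k - (\alpha + \beta\, x_i)\big), \qquad c_1 < c_2 < \dots < c_{K-1},
\end{aligned}
$$

with weakly informative $\mathcal{N}(0, 10)$ priors on $\alpha$ and $\beta$, a HalfNormal(10) prior on $\sigma$, and ordered cutpoints $c_k$. A positive $\beta$ corresponds to higher scores for unaccusative items.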
def compute_clip_similarity(df, model, preprocess, device):
"""
Compute CLIP similarity scores for image-text pairs.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : CLIP model
Loaded CLIP model
preprocess : function
CLIP preprocessing function
device : str
'cuda' or 'cpu'
Returns:
--------
pandas.DataFrame
Original dataframe with added 'CLIP_Similarity' column
"""
similarity_scores = []
for _, row in df.iterrows():
img_path = row['Filename']
text = row['Sentence']
# Preprocess image and tokenize text
img = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
text_tokenized = clip.tokenize([text]).to(device)
# Compute similarity
with torch.no_grad():
logits_per_image, _ = model(img, text_tokenized)
similarity_score = logits_per_image.item()
similarity_scores.append(similarity_score)
# Add scores to dataframe
df_copy = df.copy()
df_copy['CLIP_Similarity'] = similarity_scores
return df_copy
def compute_subject_salience(df, model, preprocess, device):
"""
Compute CLIP similarity scores for subject noun alone.
This measures how visually salient/easy to identify the subject is.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : CLIP model
Loaded CLIP model
preprocess : function
CLIP preprocessing function
device : str
'cuda' or 'cpu'
Returns:
--------
pandas.DataFrame
Original dataframe with added 'Subject_Salience' column
"""
subject_scores = []
for _, row in df.iterrows():
img_path = row['Filename']
sentence = row['Sentence']
# Extract subject noun (assumes format "The X is ...")
# Extract word after "The " and before " is"
subject = sentence.split("The ")[1].split(" is")[0]
# Preprocess image and tokenize subject
img = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
text_tokenized = clip.tokenize([subject]).to(device)
# Compute similarity
with torch.no_grad():
logits_per_image, _ = model(img, text_tokenized)
similarity_score = logits_per_image.item()
subject_scores.append(similarity_score)
df_copy = df.copy()
df_copy['Subject_Salience'] = subject_scores
return df_copy
def compute_qwen_scores(df, model, tokenizer, streamer=None):
"""
Compute verification scores using Qwen-VL-Chat multimodal LLM.
Parameters:
-----------
df : pandas.DataFrame
DataFrame with 'Filename' and 'Sentence' columns
model : Qwen-VL-Chat model
Loaded Qwen model
tokenizer : AutoTokenizer
Qwen tokenizer
streamer : TextStreamer, optional
Streamer for real-time output
Returns:
--------
pandas.DataFrame
Original dataframe with added 'VLM_Score' and 'VLM_Response' columns
"""
import re
scores = []
responses = []
for idx, row in df.iterrows():
img_path = row['Filename']
sentence = row['Sentence']
# Create query for Qwen-VL-Chat
query = tokenizer.from_list_format([
{'image': img_path},
{'text': f'Rate how well this sentence describes the image: "{sentence}"\nScore from 1-10 (1=mismatch, 10=perfect match). Reply with just the number.'},
])
# Generate response
with torch.no_grad():
response, _ = model.chat(tokenizer, query=query, history=None, streamer=streamer)
# Extract numeric score
try:
match = re.search(r'(\d+(?:\.\d+)?)', response)
score = float(match.group(1)) if match else 5.0
score = min(10.0, max(1.0, score)) # Clamp to 1-10
        except Exception:
            score = 5.0
scores.append(score)
responses.append(response)
df_copy = df.copy()
df_copy['VLM_Score'] = scores
df_copy['VLM_Response'] = responses
    return df_copy

import os
CACHE_FILE = "./cached_scores.csv"
if os.path.exists(CACHE_FILE):
df_all = pd.read_csv(CACHE_FILE)
else:
# Compute CLIP similarities
df_unerg_clip = compute_clip_similarity(df_unerg, model_clip, preprocess, device)
df_unacc_clip = compute_clip_similarity(df_unacc, model_clip, preprocess, device)
# Compute subject salience scores
df_unerg_subj = compute_subject_salience(df_unerg, model_clip, preprocess, device)
df_unacc_subj = compute_subject_salience(df_unacc, model_clip, preprocess, device)
# Compute Qwen-VL scores
df_unerg_vlm = compute_qwen_scores(df_unerg, model_vlm, tokenizer_vlm, streamer=streamer)
df_unacc_vlm = compute_qwen_scores(df_unacc, model_vlm, tokenizer_vlm, streamer=streamer)
# Combine CLIP scores with VLM scores and subject salience
df_unerg_scored = df_unerg_clip.copy()
df_unerg_scored['Subject_Salience'] = df_unerg_subj['Subject_Salience']
df_unerg_scored['VLM_Score'] = df_unerg_vlm['VLM_Score']
df_unerg_scored['VLM_Response'] = df_unerg_vlm['VLM_Response']
df_unerg_scored['VerbType'] = 'Unergative'
df_unacc_scored = df_unacc_clip.copy()
df_unacc_scored['Subject_Salience'] = df_unacc_subj['Subject_Salience']
df_unacc_scored['VLM_Score'] = df_unacc_vlm['VLM_Score']
df_unacc_scored['VLM_Response'] = df_unacc_vlm['VLM_Response']
df_unacc_scored['VerbType'] = 'Unaccusative'
# Combine for analysis
df_all = pd.concat([df_unerg_scored, df_unacc_scored], ignore_index=True)
# Save to cache
df_all.to_csv(CACHE_FILE, index=False)
print(df_all.head())

# Create comparison plot with all three metrics
fig, axes = plt.subplots(1, 3, figsize=(8, 3))
# CLIP full sentence results
sns.pointplot(data=df_all, x='VerbType', y='CLIP_Similarity',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[0], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='CLIP_Similarity',
color='black', alpha=0.5, size=8, ax=axes[0], jitter=0.2)
axes[0].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[0].set_ylabel('CLIP Similarity Score', fontsize=14, fontweight='bold')
axes[0].set_title('Full Sentence Similarity',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['CLIP_Similarity'].mean()
axes[0].text(0 if verb_type == 'Unergative' else 1, mean_val + 1,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
# Subject salience results
sns.pointplot(data=df_all, x='VerbType', y='Subject_Salience',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[1], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='Subject_Salience',
color='black', alpha=0.5, size=8, ax=axes[1], jitter=0.2)
axes[1].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Subject Salience Score', fontsize=14, fontweight='bold')
axes[1].set_title('Subject Noun Identifiability',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['Subject_Salience'].mean()
axes[1].text(0 if verb_type == 'Unergative' else 1, mean_val + 0.5,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
# VLM results
sns.pointplot(data=df_all, x='VerbType', y='VLM_Score',
hue='VerbType', palette=['#3498db', '#e74c3c'],
ax=axes[2], errorbar='ci', capsize=0.1,
linestyle='none', markers='o', legend=False)
sns.stripplot(data=df_all, x='VerbType', y='VLM_Score',
color='black', alpha=0.5, size=8, ax=axes[2], jitter=0.2)
axes[2].set_xlabel('Verb Type', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Qwen-VL Match Score (1-10)', fontsize=14, fontweight='bold')
axes[2].set_title('Scene Verification (Qwen-VL)',
fontsize=16, fontweight='bold', pad=20)
for verb_type in ['Unergative', 'Unaccusative']:
mean_val = df_all[df_all['VerbType'] == verb_type]['VLM_Score'].mean()
axes[2].text(0 if verb_type == 'Unergative' else 1, mean_val + 0.3,
f'M = {mean_val:.2f}', ha='center', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('./model_comparison_plot.png', dpi=300, bbox_inches='tight')
plt.show()

import torch
import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS
# Prepare data for Pyro
# We'll center the scores and code VerbType numerically
df_pyro = df_all.copy()
df_pyro['VerbType_num'] = df_pyro['VerbType'].map({'Unergative': -0.5, 'Unaccusative': 0.5})
df_pyro['CLIP_centered'] = df_pyro['CLIP_Similarity'] - df_pyro['CLIP_Similarity'].mean()
df_pyro['Subject_centered'] = df_pyro['Subject_Salience'] - df_pyro['Subject_Salience'].mean()
# VLM scores (integers 1-10) are treated directly as ordinal categories for the ordered-logistic model
vlm_score_tensor = torch.tensor(df_pyro['VLM_Score'].values, dtype=torch.long)
# Convert to tensors
verb_type_tensor = torch.tensor(df_pyro['VerbType_num'].values, dtype=torch.float32)
clip_tensor = torch.tensor(df_pyro['CLIP_centered'].values, dtype=torch.float32)
subject_tensor = torch.tensor(df_pyro['Subject_centered'].values, dtype=torch.float32)
# --- Model for CLIP Similarity ---
def clip_model(verb_type, obs=None):
intercept = pyro.sample('intercept', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
sigma = pyro.sample('sigma', dist.HalfNormal(10.))
mu = intercept + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.Normal(mu, sigma), obs=obs)
# --- Model for Subject Salience ---
def subject_model(verb_type, obs=None):
intercept = pyro.sample('intercept', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
sigma = pyro.sample('sigma', dist.HalfNormal(10.))
mu = intercept + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.Normal(mu, sigma), obs=obs)
# --- Model for VLM Score (Ordered Logistic) ---
k_categories = vlm_score_tensor.max().item() + 1
k_cutpoints = k_categories - 1
def vlm_model(verb_type, obs=None):
alpha = pyro.sample('alpha', dist.Normal(0., 10.))
beta = pyro.sample('beta', dist.Normal(0., 10.))
with pyro.plate("cutpoints_plate", k_cutpoints):
raw_cutpoints = pyro.sample('raw_cutpoints', dist.Normal(torch.arange(k_cutpoints).float(), 1.))
cutpoints = torch.sort(raw_cutpoints)[0]
latent_propensity = alpha + beta * verb_type
with pyro.plate('data', len(verb_type)):
pyro.sample('obs', dist.OrderedLogistic(latent_propensity, cutpoints), obs=obs)
# Run the MCMC samplers
mcmc_clip = MCMC(NUTS(clip_model), num_samples=2000, warmup_steps=1000)
mcmc_clip.run(verb_type_tensor, clip_tensor)
clip_samples = mcmc_clip.get_samples()
mcmc_subject = MCMC(NUTS(subject_model), num_samples=2000, warmup_steps=1000)
mcmc_subject.run(verb_type_tensor, subject_tensor)
subject_samples = mcmc_subject.get_samples()
mcmc_vlm = MCMC(NUTS(vlm_model), num_samples=2000, warmup_steps=1000, num_chains=1)
mcmc_vlm.run(verb_type_tensor, vlm_score_tensor)
vlm_samples = mcmc_vlm.get_samples()

# Get posterior samples and print results
clip_beta_mean = clip_samples['beta'].mean().item()
clip_beta_hdi = torch.quantile(clip_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nCLIP Similarity - Bayesian Regression:")
print(f" Beta (VerbType effect): {clip_beta_mean:.3f}")
print(f" 95% HDI: [{clip_beta_hdi[0]:.3f}, {clip_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(clip_samples['beta'] < 0).float().mean():.3f}")
subject_beta_mean = subject_samples['beta'].mean().item()
subject_beta_hdi = torch.quantile(subject_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nSubject Salience - Bayesian Regression:")
print(f" Beta (VerbType effect): {subject_beta_mean:.3f}")
print(f" 95% HDI: [{subject_beta_hdi[0]:.3f}, {subject_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(subject_samples['beta'] < 0).float().mean():.3f}")
vlm_beta_mean = vlm_samples['beta'].mean().item()
vlm_beta_hdi = torch.quantile(vlm_samples['beta'], torch.tensor([0.025, 0.975]))
print(f"\nVLM Score - Ordered Logistic Regression:")
print(f" Beta (VerbType effect): {vlm_beta_mean:.3f}")
print(f" 95% HDI: [{vlm_beta_hdi[0]:.3f}, {vlm_beta_hdi[1]:.3f}]")
print(f" P(beta < 0): {(vlm_samples['beta'] < 0).float().mean():.3f}"import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Data dictionary from your MCMC samples
beta_data = {
'Full Scene (CLIP)': clip_samples['beta'].numpy(),
'Subject Salience (CLIP)': subject_samples['beta'].numpy(),
'Scene Verification (VLM)': vlm_samples['beta'].numpy()
}
# Adjust figure size for better vertical separation
fig, ax = plt.subplots(figsize=(8, 4))
sns.set_style("whitegrid", {'axes.grid': True, 'grid.color': '.95'})
labels = list(beta_data.keys())
colors = ['#3498db', '#9b59b6', '#e74c3c']
for i, label in enumerate(labels):
samples = beta_data[label]
mean_val = samples.mean()
# 1. Calculate multiple intervals for the "stacking" effect
hdi_95 = np.percentile(samples, [2.5, 97.5])
hdi_80 = np.percentile(samples, [10, 90])
hdi_50 = np.percentile(samples, [25, 75])
# 2. Plot the stacked lines (Bottom to Top: thinnest/widest first)
# 95% Interval - Thin
ax.hlines(i, hdi_95[0], hdi_95[1], color=colors[i], linewidth=1.5, alpha=0.4, zorder=1)
# 80% Interval - Medium
ax.hlines(i, hdi_80[0], hdi_80[1], color=colors[i], linewidth=5.0, alpha=0.7, zorder=2)
# 50% Interval - Thick
ax.hlines(i, hdi_50[0], hdi_50[1], color=colors[i], linewidth=10.0, alpha=1.0, zorder=3)
# 3. Plot the Mean point
ax.plot(mean_val, i, 'o', color='white', markersize=8, zorder=4)
# 4. Perfectly Aligned Statistics
p_dir = (samples < 0).mean() if mean_val < 0 else (samples > 0).mean()
prob_text = f"$P(\\beta {'<' if mean_val < 0 else '>' } 0) = {p_dir:.2f}$"
# Locked to y-coordinate 'i' and x-coordinate 3.0 (outside plot area)
ax.text(3.0, i, prob_text, va='center', ha='left',
fontsize=13, fontweight='bold', color=colors[i])
# 5. Descriptive Annotations (The "How to Read" Guide)
ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, alpha=0.6, zorder=0)
# Arrow pointing Left (Negative Beta)
ax.annotate('', xy=(-5, -1.0), xytext=(-0.5, -1.0),
arrowprops=dict(arrowstyle="->", color='gray', lw=1.5))
ax.text(-2.75, -1.4, "Lower Scores for\nUnaccusatives", ha='center', color='gray', fontweight='bold')
# Arrow pointing Right (Positive Beta)
ax.annotate('', xy=(2.5, -1.0), xytext=(0.5, -1.0),
arrowprops=dict(arrowstyle="->", color='gray', lw=1.5))
ax.text(1.5, -1.4, "Lower Scores for\nUnergatives", ha='center', color='gray', fontweight='bold')
# 6. Final Layout Polish
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels, fontweight='bold', fontsize=12)
ax.set_xlabel('Posterior Beta Weight (Unaccusative vs. Unergative)', fontsize=13, labelpad=45)
# Lock limits so text and arrows don't shift
ax.set_xlim(-6, 3)
ax.set_ylim(-1.5, len(labels) - 0.5)
sns.despine(left=True, bottom=True)
plt.subplots_adjust(right=0.75, bottom=0.2) # Make room for text on right and guide on bottom
plt.savefig('./model_pyro.png', dpi=300, bbox_inches='tight')
plt.show()