User:Polygnotus/CitationVerification
MiniCheck is a claim verification model by Bespoke Labs that is allegedly capable of validating claims against reference text.
Note that it currently only supports English.
I haven't really found a way to deal with paywalls other than overthrowing capitalism. It is possible a source supports a claim but not in the snippet that is visible, so it would be useful if the user could supply the text of an article manually whenever the tool can't reach the article.
I should probably figure out a way to deal with multiple refs that each partially support one claim.
Currently I am using Trafilatura to extract the relevant text of a webpage and ignore the rest. I should probably add a step that asks Claude to get the most relevant text that might or might not support the claim.
I can probably use pdfplumber to extract text from PDFs. If the PDF is just scanned images I can use pdf2image and then OCR with pytesseract.
https://www.bespokelabs.ai/bespoke-minicheck
Python code
|
|---|
import os
import time
import re
import requests
from typing import List, Tuple, Optional
from bespokelabs import BespokeLabs
import anthropic
# ============================================================================
# CONFIGURATION
# ============================================================================
WIKIPEDIA_ARTICLES = [
    "Mastercooks of Belgium",
    "Leaked Mohammad Javad Zarif audiotape"
]
CLAIMS_PER_ARTICLE = 10
API_WAIT_TIME = 1.0  # seconds between API calls
MAX_RETRIES = 3
INITIAL_BACKOFF = 2.0  # seconds; doubled on each retry attempt

# API keys: prefer environment variables so real secrets never get committed
# with the source. The placeholder fallbacks keep the old edit-in-place
# workflow working for anyone who pasted keys directly into the file.
BESPOKE_API_KEY = os.environ.get("BESPOKE_API_KEY", "your_bespoke_api_key_here")
CLAUDE_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "your_anthropic_api_key_here")

# ============================================================================
# API CLIENTS
# ============================================================================
bl = BespokeLabs(auth_token=BESPOKE_API_KEY)
claude_client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def exponential_backoff(attempt: int) -> None:
    """Pause before the next retry, doubling the delay per prior attempt."""
    delay = INITIAL_BACKOFF * (2 ** attempt)
    print(f"Waiting {delay:.1f} seconds before retry...")
    time.sleep(delay)
def strip_infobox(wiki_text: str) -> str:
    """Remove {{Infobox ...}} templates from Wikipedia wikitext.

    Uses brace counting rather than a non-greedy regex: an infobox often
    contains nested multi-line templates (e.g. {{URL|...}}), and a pattern
    like ``\\{\\{Infobox[\\s\\S]*?\\n\\}\\}`` would stop at the first inner
    ``}}`` on its own line, leaving half the infobox behind.

    Args:
        wiki_text: Raw wikitext of the article.

    Returns:
        The wikitext with every (case-insensitive) infobox template removed.
        Unbalanced braces leave the text untouched from that point on.
    """
    out = []
    lowered = wiki_text.lower()
    n = len(wiki_text)
    pos = 0
    while True:
        start = lowered.find('{{infobox', pos)
        if start == -1:
            out.append(wiki_text[pos:])
            break
        out.append(wiki_text[pos:start])
        # Scan forward, tracking template nesting depth until the opening
        # {{ of the infobox is matched by its closing }}.
        depth = 0
        j = start
        while j < n:
            pair = wiki_text[j:j + 2]
            if pair == '{{':
                depth += 1
                j += 2
            elif pair == '}}':
                depth -= 1
                j += 2
                if depth == 0:
                    break
            else:
                j += 1
        if depth != 0:
            # Never closed: keep the remaining text instead of dropping it.
            out.append(wiki_text[start:])
            break
        pos = j
    return ''.join(out)
def get_wikipedia_article(article_name: str) -> Optional[str]:
    """Fetch the raw wikitext of an English Wikipedia article.

    Args:
        article_name: Title of the article to fetch.

    Returns:
        The article wikitext with the infobox stripped, or None if the
        article does not exist or all retries fail.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'titles': article_name,
        'format': 'json',
        'rvslots': 'main'
    }
    headers = {
        'User-Agent': 'FactCheckBot/1.0 (Educational Research Project)'
    }
    for attempt in range(MAX_RETRIES):
        try:
            time.sleep(API_WAIT_TIME)  # rate-limit ourselves between calls
            # timeout= prevents a stalled connection from hanging the whole
            # run forever (the original call had no timeout at all).
            response = requests.get(url, params=params, headers=headers,
                                    timeout=30)
            response.raise_for_status()
            data = response.json()
            pages = data['query']['pages']
            page_id = list(pages.keys())[0]
            # The MediaWiki API reports a missing page with the id '-1'.
            if page_id == '-1':
                print(f"Article '{article_name}' not found")
                return None
            content = pages[page_id]['revisions'][0]['slots']['main']['*']
            return strip_infobox(content)
        except Exception as e:
            print(f"Error fetching Wikipedia article (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                exponential_backoff(attempt)
            else:
                return None
def get_claims_from_claude(article_text: str, num_claims: int) -> List[Tuple[str, str]]:
    """Ask Claude to extract (claim, reference URL) pairs from wikitext.

    Returns at most ``num_claims`` pairs parsed from Claude's
    ``Claim || URL`` line format, or an empty list if all retries fail.
    """
    prompt = f"""in this wikipedia article, can you return a list of {num_claims} claims supported by {num_claims} urls. skip the infobox if a <ref> does not contain an url, then skip it and the claim it supports use this format: Claim || URL Do not return anything else. Do not make it a table. Just {num_claims} lines containing a claim, then || then the url
{article_text}"""
    for attempt in range(MAX_RETRIES):
        try:
            time.sleep(API_WAIT_TIME)
            message = claude_client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}]
            )
            reply = message.content[0].text
            pairs = []
            for row in reply.strip().split('\n'):
                # Only lines following the "Claim || URL" format count.
                if '||' not in row:
                    continue
                left, _, right = row.partition('||')
                pairs.append((left.strip(), right.strip()))
            return pairs[:num_claims]
        except Exception as e:
            print(f"Error calling Claude API (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                exponential_backoff(attempt)
            else:
                return []
def fetch_url_content(url: str) -> Optional[str]:
    """Fetch a reference URL and return an approximation of its visible text.

    Args:
        url: The web page to fetch.

    Returns:
        Up to the first 10,000 characters of tag-stripped page text, or
        None on failure. 4xx responses (404, paywalled 403, ...) are not
        retried: they are permanent for our purposes, so backing off and
        re-requesting only wastes time.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    for attempt in range(MAX_RETRIES):
        try:
            time.sleep(API_WAIT_TIME)
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # Crude HTML-to-text: drop script/style blocks, then all tags,
            # then collapse whitespace. (A real extractor like Trafilatura
            # would do better; this keeps the script dependency-free.)
            text = re.sub(r'<script[\s\S]*?</script>', '', response.text, flags=re.IGNORECASE)
            text = re.sub(r'<style[\s\S]*?</style>', '', text, flags=re.IGNORECASE)
            text = re.sub(r'<[^>]+>', ' ', text)
            text = re.sub(r'\s+', ' ', text)
            # Limit content length for the MiniCheck context.
            return text[:10000]
        except requests.exceptions.HTTPError as e:
            print(f"Error fetching URL {url} (attempt {attempt + 1}): {e}")
            status = e.response.status_code if e.response is not None else None
            if status is not None and 400 <= status < 500:
                # Client error: retrying will not help.
                return None
            if attempt < MAX_RETRIES - 1:
                exponential_backoff(attempt)
            else:
                return None
        except Exception as e:
            print(f"Error fetching URL {url} (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                exponential_backoff(attempt)
            else:
                return None
def check_claim_with_minicheck(claim: str, context: str) -> Optional[float]:
    """Score how strongly ``context`` supports ``claim`` via Bespoke MiniCheck.

    Returns the model's support probability, or None if every attempt fails.
    """
    attempt = 0
    while attempt < MAX_RETRIES:
        try:
            time.sleep(API_WAIT_TIME)
            result = bl.minicheck.factcheck.create(claim=claim, context=context)
            return result.support_prob
        except Exception as e:
            print(f"Error calling MiniCheck API (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                exponential_backoff(attempt)
            else:
                return None
        attempt += 1
# ============================================================================
# MAIN PROCESSING
# ============================================================================
def process_article(article_name: str) -> None:
    """Run the full pipeline for one article: fetch, extract claims, verify.

    Prints progress as it goes and a summary table of
    ``claim || url || score`` lines at the end.
    """
    divider = '=' * 80
    print(f"\n{divider}")
    print(f"Processing: {article_name}")
    print(f"{divider}\n")

    # Nothing to verify without the article text itself.
    wikitext = get_wikipedia_article(article_name)
    if not wikitext:
        print(f"Failed to fetch article: {article_name}")
        return

    print(f"Extracting {CLAIMS_PER_ARTICLE} claims...")
    claim_pairs = get_claims_from_claude(wikitext, CLAIMS_PER_ARTICLE)
    print(f"Found {len(claim_pairs)} claims\n")

    total = len(claim_pairs)
    verified = []
    for idx, (claim, url) in enumerate(claim_pairs, start=1):
        print(f"[{idx}/{total}] Processing claim...")
        print(f"Claim: {claim[:100]}...")
        print(f"URL: {url}")

        # Pull the cited page; skip the claim if the source is unreachable.
        page_text = fetch_url_content(url)
        if not page_text:
            print("Failed to fetch URL content\n")
            continue

        score = check_claim_with_minicheck(claim, page_text)
        if score is None:
            print("Failed to verify claim\n")
            continue

        print(f"MiniCheck score: {score}")
        verified.append((claim, url, score))
        print()

    print(f"\n{divider}")
    print(f"RESULTS FOR: {article_name}")
    print(f"{divider}\n")
    for claim, url, score in verified:
        print(f"{claim} || {url} || {score}")
        print()
def main():
    """Entry point: fact-check every configured article in order."""
    print(f"Starting fact-checking process for {len(WIKIPEDIA_ARTICLES)} articles...")
    for title in WIKIPEDIA_ARTICLES:
        process_article(title)


if __name__ == "__main__":
    main()
|
| Statement | Source | MiniCheck Score | Claude Assessment | Confidence | Explanation |
|---|---|---|---|---|---|
| In 1980, Pierre Romeyer established the association along with Pierre Wynant and Jacques Deluc, in order to create a Belgian professional body of Chefs, dedicated to defending Belgian Culinary tradition. | https://horecawebzine.com/en/the-mastercooks-of-belgium-nieuwe-topchefs/ | 0.5924 | Partially | High | The source confirms most elements but contains a factual discrepancy. Pierre Romeyer did establish the association in 1980 with Pierre Wynants (spelled "Wynants" not "Wynant") and Jacques Deluc. However, the founding was specifically motivated by Romeyer's rejection from the French association, not just a general desire to defend Belgian culinary tradition. |
| Pierre Romeyer was motivated to start the organization after being rejected by the French association of master chefs, which did not accept women or foreigners. | https://www.lesoir.be/167762/article/2018-07-12/pierre-romeyer-le-bocuse-belge-est-mort | 0.7978 | Yes | High | The source directly supports this claim. The text explicitly states that Pierre Romeyer founded the Belgian association in 1980 because he could not join the French equivalent which did not admit foreigners or women. |
| The founders originally named the association the Maîtres Cuisiniers (Master Chefs), but it was retitled the Master Cooks of Belgium (Les Maîtres Cuisiniers de Belgique). | https://horecawebzine.com/fr/livre-de-recettes-mastercooks-meilleures-recettes/ | 0.5923 | No | High | The source directly contradicts the claim. It was originally called "Les Maîtres Cuisiniers de Belgique" and later renamed to "The Mastercooks of Belgium®" four decades later - the opposite of what the claim states. |
| The organization's structure consists of 3 vice-presidents representing Flanders, Wallonia, and Brussels. | https://horecamagazine.be/fr/cedric-poncelet-nouveau-president-the-mastercooks-of-belgium/ | 0.8517 | Yes | High | The source explicitly identifies three vice-presidents by region: Patrick Meirsman for Brussels-Capital Region, Benoît Bourivain for Wallonia, and Félix Alen for Flanders. |
| The organization publishes a guide highlighting the bios of its members, including representatives in Spain, France, and the United States. | https://horecamagazine.be/mastercooks-of-belgium-the-guide-2018/ | 0.9044 | Yes | High | The source confirms the guide contains biographical information (photo, practical details, quotes) for members including ambassadors in France, Spain and the United States. |
| The guide describes the events, restaurants, and best cooks in Belgium. It contains recommendations from Gault & Millau and Michelin ratings. | https://brussels-express.eu/the-finest-chefs-in-the-kingdom-mastercooks-of-belgium/ | 0.4686 | Partially | High | The source confirms the guide describes restaurants and chefs, and explicitly states it includes recommendations from Michelin and Gault & Millau. However, the claim about describing "events" is only partially supported - the source mentions increasing awareness of upcoming events, not describing specific events. |
| In 2022, the Master Cooks received the Prince Alexandre of Belgium literary prize for their book, The Mastercooks of Belgium: L'excellence gastronomique belge. | https://histoiresroyales.fr/remise-prix-litteraire-princesse-lea-de-belgique-2022-gastronomie/ | 0.7978 | No | High | The Mastercooks received a "special jury prize" in 2022, not the main Prince Alexandre literary prize. Additionally, their book was titled "L'excellence gastronomique belge" only, not "The Mastercooks of Belgium: L'excellence gastronomique belge." |
| The Mastercooks participate in the television program Z-Mastercooks, which runs on the business channels Kanaal Z/Canal Z. | https://www.roularta.be/nl/over-roularta/persberichten/topchefs-de-tweede-reeks-z-mastercooks | 0.8667 | Yes | High | The source clearly confirms that "Z-Mastercooks" runs on Kanaal Z/Canal Z business channels and that Mastercooks members participate in the program. |
| The association has also chaired other competitions, such as the "Best Artisan Chef of Belgium" award (Meilleur Artisan-Cuisinier de Belgique). | https://www.ardenneweb.eu/reportages/2008/le_meilleur_artisan_cuisinier_de_belgique_les_7_fontaines_d_awenne_saint_hubert_a_l_ | 0.4375 | Partially | Medium | The source confirms the competition exists and was "organized" by the Master Chefs of Belgium association, but does not explicitly confirm they "chaired" it - only that they organized it. |
| Mastercooks collaborates with Carrefour Belgium to review products. | https://newsroom.carrefour.be/carrefour-celebre-10-ans-de-collaboration-avec-the-mastercooks-of-belgium-en-faveur-du-gout-et-de-la-qualite-de-ses-produits | 0.9739 | Yes | High | The source clearly states that Carrefour Belgium has been collaborating with The Mastercooks for 10 years, with chefs testing, validating and labeling products based on taste, texture, consistency and appearance. |
| Statement | Source | MiniCheck Score | Claude Assessment | Confidence | Explanation |
|---|---|---|---|---|---|
| In April 2021, more than three hours of audiotape was leaked from a seven-hour interview between economist Saeed Leylaz and Iranian foreign minister Mohammad Javad Zarif | https://www.npr.org/2021/04/26/990862781/in-leaked-audio-irans-foreign-minister-criticizes-influence-of-revolutionary-gua | 0.9524 | Yes | High | The source directly confirms all key elements: more than three hours of a seven-hour interview were leaked, involving Foreign Minister Mohammad Javad Zarif and economist Saeed Laylaz, in April 2021. |
| The tape was obtained by the Saudi-funded news channel Iran International and publicized by The New York Times | https://www.washingtonpost.com/politics/2021/04/27/john-kerry-iran-controversy-explained/ | 0.0052 | No | High | The source provides no information about how the tape was obtained or which news organizations publicized it. There is no mention of Iran International or The New York Times. |
| Critics of Zarif were calling for his resignation, saying that "he had threatened Iran's national security by revealing to the world the country's inner politics" | https://www.cnn.com/2021/04/26/middleeast/iran-zarif-revolutionary-guards-audio-leak-intl/index.html | 0.0052 | No | High | The source does not mention critics calling for Zarif's resignation or anyone saying he threatened Iran's national security. The article only mentions a foreign ministry spokesman saying quotes were taken out of context. |
| Iran's president Hassan Rouhani ordered an investigation to identify who leaked the tape | https://www.hindustantimes.com/world-news/iran-orders-probe-into-leaked-zarif-audio-conspiracy-101619520977150.html | 0.9524 | Yes | High | The source directly supports this, explicitly stating that President Hassan Rouhani ordered the intelligence ministry to investigate and identify who leaked the recording. |
| On May 2, 2021, in an Instagram post, Zarif apologized for the remarks that he made against Soleimani | https://www.jpost.com/middle-east/zarif-apologizes-for-leaked-comments-about-soleimani-666974 | 0.0373 | Partially | Medium | The source confirms Zarif apologized for leaked comments about Soleimani on May 2, 2021, but does not explicitly state the apology was made "in an Instagram post" - the specific platform is not mentioned. |
| Kerry was not recorded speaking on the tape, and the date of the recording is unknown | https://people.com/politics/john-kerry-denies-he-shared-info-about-israeli-military-operations-with-iran/ | 0.0421 | Partially | Medium | The source confirms Kerry was not recorded speaking (it was Zarif's account of what Kerry allegedly told him), but provides no information about whether the recording date is unknown. |
| On September 27, 2013, Kerry met with Zarif during the P5+1 and Iran summit, which eventually led to the JCPOA nuclear agreement | https://news.yahoo.com/diplomats-hail-iranian-attitude-nuke-talks-220409043.html | 0.0180 | Yes | High | The source confirms all key elements: Kerry and Zarif met on September 27, 2013, at a P5+1 meeting aimed at resolving Iran's nuclear dispute, with the goal of reaching a deal within a year (the process that led to JCPOA). |
| Israeli Prime Minister Benjamin Netanyahu revealed in July 2017 that Israeli airstrikes had targeted Iranian-backed Hezbollah-bound convoys in Syria "dozens of times" | https://www.theatlantic.com/news/archive/2017/07/netanyahu-admits-israel-struck-iranian-fighters-dozens-of-times/534261/ | 0.9668 | Yes | High | The source directly supports the claim, stating Netanyahu revealed at a meeting on July 19, 2017 that Israeli strikes had targeted Iranian weapons shipments to Hezbollah "dozens of times." |
Extended content
| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Verification Results for Labubu[edit]
Verification Results for Auditorium of the Old Burgtheater[edit]
|