# This script extracts structured data from the infobox of a Wikipedia page (in this case, the German-language UEFA Champions League 2022/23 page) and exports the content to a clean, readable CSV file.
# Data: UEFA Champions League
# shebang
#!/usr/bin/env python
# coding: utf-8
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# Define the Wikipedia URL (German-language article for the 2022/23 season)
url = "https://de.wikipedia.org/wiki/UEFA_Champions_League_2022/23"

# Send the HTTP GET request and parse the HTML.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() aborts on a 4xx/5xx response instead of silently
# parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the infobox table by its CSS class
table = soup.find('table', {'class': 'toccolours infobox toptextcells float-right'})
if table is None:
    # soup.find() returns None when nothing matches; stop here with a clear
    # message rather than failing later with an AttributeError on `table`.
    raise SystemExit(f"No infobox table found at {url}")
# Collected infobox content, stored as key -> value in insertion order
data = {}

# The infobox heading is looked up as the header cell with colspan="2"
title = table.find('th', {'colspan': '2'})
if title:
    data['Title'] = title.get_text().strip()

# Walk every table row. A row contributes a key/value pair only when it has
# exactly two <td> cells and the first cell carries a style attribute; all
# other rows are skipped.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 2:
        continue
    label_cell, content_cell = cells
    if not label_cell.has_attr('style'):
        continue
    # A repeated key overwrites the earlier entry
    data[label_cell.get_text().strip()] = content_cell.get_text().strip()
# Convert the collected key/value pairs into a two-column DataFrame and save
# the raw (uncleaned) version as CSV
df = pd.DataFrame(list(data.items()), columns=["Key", "Value"])
df.to_csv("uefa_champions_league_2022_23_info.csv", index=False)

# Clean up line breaks in memory instead of re-reading the CSV just written.
# pd.read_csv with default settings converts empty cells and strings such as
# "NA" to NaN (which str() then renders as the literal text "nan") and
# re-parses numeric-looking text, so the round trip could silently alter
# the scraped values.
df['Value'] = df['Value'].map(lambda x: ' '.join(str(x).splitlines()).strip())

# Save the cleaned version
cleaned_file_path = "cleaned_uefa_champions_league_2022_23_info.csv"
df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")