import pandas as pd
Data Extraction / Scraping
df = pd.read_csv("federal_cty_harm.csv")
df.head()
county_code | election_year | state | eligible_voters | number_voters | valid_votes | invalid_votes | turnout | cdu | csu | ... | volt | zentrum | area | population | flag_unsuccessful_naive_merge | total_votes | cdu_csu | far_right | far_left | far_left_w_linke | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 1990 | 1 | 69563 | 50485 | 50036 | 449 | 0.725745 | 0.370533 | 0.0 | ... | 0.0 | 0.0 | 56.36 | 86.6 | NaN | 50036 | 0.370533 | NaN | NaN | NaN |
1 | 1001 | 1994 | 1 | 68987 | 52379 | 51823 | 556 | 0.759259 | 0.344191 | 0.0 | ... | 0.0 | 0.0 | 56.44 | 87.9 | NaN | 51823 | 0.344191 | NaN | NaN | NaN |
2 | 1001 | 1998 | 1 | 65755 | 50560 | 49862 | 698 | 0.768915 | 0.291565 | 0.0 | ... | 0.0 | 0.0 | 56.44 | 84.7 | NaN | 49862 | 0.291565 | NaN | NaN | NaN |
3 | 1001 | 2002 | 1 | 65740 | 49054 | 48553 | 501 | 0.746182 | 0.295800 | 0.0 | ... | 0.0 | 0.0 | 56.38 | 84.7 | NaN | 48553 | 0.295800 | NaN | NaN | NaN |
4 | 1001 | 2005 | 1 | 66970 | 49002 | 48235 | 767 | 0.731701 | 0.288380 | 0.0 | ... | 0.0 | 0.0 | 56.38 | 86.1 | NaN | 48235 | 0.288380 | NaN | NaN | NaN |
5 rows × 120 columns
df.dtypes
county_code int64
election_year int64
state int64
eligible_voters int64
number_voters int64
...
total_votes int64
cdu_csu float64
far_right float64
far_left float64
far_left_w_linke float64
Length: 120, dtype: object
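With 120 columns in the harmonized county file, it helps to keep a narrow working view. A minimal sketch that selects only the identifiers and the derived share columns visible in the preview above (the column list is assumed from that preview, so adjust it to the actual file):
keep_cols = ["county_code", "election_year", "state", "turnout", "cdu_csu", "far_right", "far_left"]  # assumed from df.head() above
df[keep_cols].describe()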
# import requests
# from bs4 import BeautifulSoup
# elections_data = requests.get("https://www.wahlrecht.de/ergebnisse/hamburg.htm").text
# print(elections_data)
# soup = BeautifulSoup(elections_data, "html.parser")
# soup.head.title
# hh_table = soup.select("table")
# len(hh_table)
# hh_table[1]
# hh_table = soup.select("table.border")
# hh_tables = hh_table[0]
# hh_tables.text[:200]
# row_els = hh_tables.select("tbody tr")
# len(row_els)
# row_els[1]
# [cell.text for cell in row_els[0].select("td")]
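The commented-out exploration above fetches the Hamburg results page with requests and picks the table apart with BeautifulSoup. For a quick first look, pandas can fetch and parse in one call; a sketch, assuming the page keeps its current <table class="border"> markup (pd.read_html needs lxml or html5lib installed):
hh_dfs = pd.read_html("https://www.wahlrecht.de/ergebnisse/hamburg.htm", attrs={"class": "border"})  # list of matching tables
hh_dfs[0].head()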
# import nest_asyncio
# import asyncio
# from playwright.async_api import async_playwright
# import pandas as pd
# from bs4 import BeautifulSoup
# # Enable async in Jupyter
# nest_asyncio.apply()
# BASE_URL = "https://www.wahlrecht.de/ergebnisse/index.htm"
# HOST_URL = "https://www.wahlrecht.de/ergebnisse/"
# async def get_links_from_base():
# """Fetch the base page and extract all links from the second table with class='border'."""
# async with async_playwright() as p:
# browser = await p.chromium.launch(headless=True)
# page = await browser.new_page()
# await page.goto(BASE_URL)
# # Get the HTML of the base page
# base_content = await page.content()
# soup = BeautifulSoup(base_content, "html.parser")
# # Find all tables with class='border'
# tables = soup.find_all("table", class_="border")
# if len(tables) < 2:
# print("Could not find the second table with class='border'.")
# await browser.close()
# return []
# # Get the second table
# second_table = tables[1]
# # In the second table, find the FIRST <tbody>
# tbody = second_table.find("tbody")
# if not tbody:
# print("No <tbody> found in the second table.")
# await browser.close()
# return []
# # Extract links from the rows
# links = []
# for tr in tbody.find_all("tr"):
# # Check each cell for <a> tags
# for td in tr.find_all("td"):
# a_tag = td.find("a", href=True)
# if a_tag:
# relative_href = a_tag["href"]
# # Convert relative URL to absolute
# full_url = HOST_URL + relative_href
# link_text = a_tag.get_text(strip=True)
# links.append((link_text, full_url))
# await browser.close()
# return links
# def parse_complex_table(table_soup, source_name, table_index):
# """
# Parse a table with a two-row <thead> structure that represents election results.
# Expected structure (e.g., on thueringen.htm):
# <thead>
# <tr>
# <th rowspan="2"></th> <!-- empty cell: becomes "Party" -->
# <th class="jahr" colspan="2">1990</th>
# <th class="jahr" colspan="2">1994</th>
# ... etc.
# </tr>
# <tr>
# <th>%</th>
# <th>Sitze</th>
# <th>%</th>
# <th>Sitze</th>
# ... etc.
# </tr>
# </thead>
# The final MultiIndex for columns will be:
# [("Party", ""), ("1990", "%"), ("1990", "Sitze"), ("1994", "%"), ("1994", "Sitze"), …]
# The <tbody> should have rows with exactly (1 + 2*N) cells.
# Returns a DataFrame with MultiIndex columns or None if the structure is not as expected.
# """
# thead = table_soup.find("thead")
# if not thead:
# return None
# header_rows = thead.find_all("tr")
# if len(header_rows) != 2:
# return None # Expect exactly two header rows
# # ----- Row 0: Top-level headers -----
# # The first <th> (with rowspan="2") should be empty and becomes the Party column header.
# row0_cells = header_rows[0].find_all("th")
# if not row0_cells or len(row0_cells) < 2:
# return None
# party_header = row0_cells[0].get_text(strip=True) or "Party"
# top_headers = [party_header] # first column header
# # For the remaining cells, repeat each cell's text according to its colspan.
# for cell in row0_cells[1:]:
# try:
# colspan = int(cell.get("colspan", "1"))
# except ValueError:
# colspan = 1
# text = cell.get_text(strip=True)
# for _ in range(colspan):
# top_headers.append(text)
# # ----- Row 1: Bottom-level headers (sub-headers) -----
# row1_cells = header_rows[1].find_all("th")
# if not row1_cells:
# return None
# # For the Party column, we assign an empty string.
# bottom_headers = [""]
# for cell in row1_cells:
# bottom_headers.append(cell.get_text(strip=True))
# if len(top_headers) != len(bottom_headers):
# return None
# # Build the MultiIndex for the DataFrame columns.
# columns = pd.MultiIndex.from_arrays([top_headers, bottom_headers])
# # ----- Parse the <tbody> -----
# tbody = table_soup.find("tbody")
# if not tbody:
# return None
# data_rows = []
# for tr in tbody.find_all("tr"):
# cells = tr.find_all(["td", "th"])
# row_values = [cell.get_text(strip=True) for cell in cells]
# if row_values:
# data_rows.append(row_values)
# # Only accept rows that match the number of header columns.
# expected_cols = len(columns)
# valid_rows = [row for row in data_rows if len(row) == expected_cols]
# if not valid_rows:
# return None
# df = pd.DataFrame(valid_rows, columns=columns)
# df.insert(0, "Table_Index", table_index)
# df.insert(0, "Source_Page", source_name)
# return df
# def parse_complex_table(table_soup, source_name, table_index):
# """
# Parse a table with a two-row <thead> structure that represents election results,
# building a MultiIndex header as follows:
# Expected <thead> structure:
# <thead>
# <tr>
# <th rowspan="2"></th> <!-- empty cell becomes "Party" -->
# <th class="jahr" colspan="2">1990</th>
# <th class="jahr" colspan="2">1994</th>
# ... etc.
# </tr>
# <tr>
# <th>%</th>
# <th>Sitze</th>
# <th>%</th>
# <th>Sitze</th>
# ... etc.
# </tr>
# </thead>
# The final MultiIndex for columns will be:
# [("Party", ""), ("1990", "%"), ("1990", "Sitze"), ("1994", "%"), ("1994", "Sitze"), …]
# The data rows may be split across multiple <tbody> tags.
# Returns a DataFrame with MultiIndex columns or None if the structure is not as expected.
# """
# # ----- Parse the header -----
# thead = table_soup.find("thead")
# if not thead:
# return None # No <thead> found; skip this table
# header_rows = thead.find_all("tr")
# if len(header_rows) != 2:
# return None # Expect exactly two header rows
# # Row 0: Top-level headers
# row0_cells = header_rows[0].find_all("th")
# if not row0_cells or len(row0_cells) < 2:
# return None
# party_header = row0_cells[0].get_text(strip=True) or "Party"
# top_headers = [party_header]
# for cell in row0_cells[1:]:
# try:
# colspan = int(cell.get("colspan", "1"))
# except ValueError:
# colspan = 1
# text = cell.get_text(strip=True)
# for _ in range(colspan):
# top_headers.append(text)
# # Row 1: Sub-headers (lower-level headers)
# row1_cells = header_rows[1].find_all("th")
# if not row1_cells:
# return None
# bottom_headers = [""]
# for cell in row1_cells:
# bottom_headers.append(cell.get_text(strip=True))
# if len(top_headers) != len(bottom_headers):
# return None
# # Create the MultiIndex columns
# columns = pd.MultiIndex.from_arrays([top_headers, bottom_headers])
# # ----- Parse the data from all <tbody> tags -----
# tbodys = table_soup.find_all("tbody")
# if not tbodys:
# return None
# data_rows = []
# for tbody in tbodys:
# for tr in tbody.find_all("tr"):
# cells = tr.find_all(["td", "th"])
# row_values = [cell.get_text(strip=True) for cell in cells]
# if row_values:
# data_rows.append(row_values)
# # Only keep rows that have the expected number of columns.
# expected_cols = len(columns)
# valid_rows = [row for row in data_rows if len(row) == expected_cols]
# if not valid_rows:
# return None
# df = pd.DataFrame(valid_rows, columns=columns)
# df.insert(0, "Table_Index", table_index)
# df.insert(0, "Source_Page", source_name)
# return df
# async def scrape_page_tables(url, name):
# """
# Scrape all <table> tags from a given URL.
# We'll try parse_complex_table() first.
# If that fails (None), we'll fallback to the naive approach.
# """
# async with async_playwright() as p:
# browser = await p.chromium.launch(headless=True)
# page = await browser.new_page()
# await page.goto(url)
# # Get page HTML
# html = await page.content()
# soup = BeautifulSoup(html, "html.parser")
# # Find all <table>
# tables = soup.find_all("table")
# dfs = []
# for i, table in enumerate(tables):
# # Try complex header parsing
# df_complex = parse_complex_table(table, name, i) # <-- uses the new version
# if df_complex is not None:
# # If we successfully parsed a complex header, use that
# dfs.append(df_complex)
# else:
# # Fallback: naive approach
# rows = table.find_all("tr")
# data = []
# # Attempt to detect headers in the first row if it has <th>
# if rows and rows[0].find("th"):
# headers = [th.get_text(strip=True) for th in rows[0].find_all("th")]
# data_start_idx = 1
# else:
# headers = None
# data_start_idx = 0
# # Extract rows
# for row in rows[data_start_idx:]:
# cols = [td.get_text(strip=True) for td in row.find_all("td")]
# if cols:
# data.append(cols)
# max_cols = max((len(r) for r in data), default=0)
# if headers and len(headers) != max_cols:
# headers = None
# if data:
# df = pd.DataFrame(data, columns=headers if headers else None)
# df.insert(0, "Table_Index", i)
# df.insert(0, "Source_Page", name)
# dfs.append(df)
# await browser.close()
# return dfs
# async def main():
# """Main flow: get links, then scrape each page's tables."""
# links = await get_links_from_base()
# if not links:
# print("No links found from the base page.")
# return
# print(f"Found {len(links)} links from base page.")
# all_results = {}
# for link_text, link_url in links:
# print(f"\nScraping: {link_text} -> {link_url}")
# page_dfs = await scrape_page_tables(link_url, link_text)
# if page_dfs:
# all_results[link_text] = page_dfs
# # Save/Display only the tables that meet our column threshold
# min_columns = 6 # change this number as appropriate for your complex tables
# for state_name, dfs in all_results.items():
# for idx, df in enumerate(dfs):
# if df.shape[1] < min_columns:
# # Skip tables that are too "simple"
# continue
# print(f"\nDataFrame from {state_name} (Table {idx}):")
# display(df.head())
# csv_name = f"{state_name.replace(' ', '_')}_table_{idx}.csv"
# df.to_csv(csv_name, index=False)
# print(f"Saved: {csv_name}")
# # Run in Jupyter
# await main()
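The header parsing in parse_complex_table hinges on pandas' two-level columns: the year row is expanded by its colspan and paired with the %/Sitze row. A standalone sketch of that pairing, with made-up values, just to show what the resulting frame looks like:
top = ["Party", "1990", "1990", "1994", "1994"]      # year row, repeated per colspan
bottom = ["", "%", "Sitze", "%", "Sitze"]            # sub-header row
cols = pd.MultiIndex.from_arrays([top, bottom])
demo = pd.DataFrame([["CDU", "45,4", "29", "42,6", "29"]], columns=cols)  # illustrative values
demo["1990"]  # selecting a year returns both its % and Sitze columns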
Data Transformation & Cleaning
import glob

# Path to your CSV folder
csv_folder = "state_election_results/*.csv"

# Find all CSV files
files = glob.glob(csv_folder)

all_long_dfs = []

for file in files:
    df = pd.read_csv(file)

    # The first 3 columns are "Source_Page", "Table_Index", "Party"
    fixed_cols = df.columns[:3]  # e.g. ['Source_Page', 'Table_Index', 'Party']
    # The rest are the repeated year columns
    repeated_years = df.columns[3:]

    # Rename the repeated columns in pairs:
    # e.g. "1952", "1952" -> "1952_%", "1952_Sitze"
    new_cols = list(fixed_cols)  # start with the first 3 columns as-is

    # Make sure we have an even number of columns in repeated_years.
    # If it's odd, there's a leftover column you might need to handle or skip.
    if len(repeated_years) % 2 != 0:
        print(f"Warning: {file} has an odd number of repeated columns. Check manually!")

    # Build new column names in pairs
    for i in range(0, len(repeated_years), 2):
        year_col_1 = repeated_years[i]
        # The second column in the pair:
        if i + 1 < len(repeated_years):
            year_col_2 = repeated_years[i + 1]
        else:
            year_col_2 = None

        # We assume both are the same year label (like "1952", "1952"),
        # so we can get the year from the first
        year_label = year_col_1

        # Create new names
        new_cols.append(f"{year_label}_%")
        new_cols.append(f"{year_label}_Sitze")

    # Assign the new column names to df
    df.columns = new_cols

    # Now "df" has columns like:
    # Source_Page, Table_Index, Party, 1952_%, 1952_Sitze, 1956_%, 1956_Sitze, ...

    # Melt (unpivot) from wide to long
    # id_vars = the columns we don't want to melt
    id_vars = ["Source_Page", "Table_Index", "Party"]
    value_vars = df.columns[len(id_vars):]  # all the year_% and year_Sitze columns

    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="Year_Stat",
        value_name="Value"
    )

    # "Year_Stat" looks like "1952_%" or "1952_Sitze";
    # split it into "Year" and "StatType"
    long_df[["Year", "StatType"]] = long_df["Year_Stat"].str.split("_", expand=True)

    # Keep only the relevant columns
    long_df = long_df[["Source_Page", "Party", "Year", "StatType", "Value"]]

    # Append to list
    all_long_dfs.append(long_df)

# Concatenate all long dataframes
full_long_df = pd.concat(all_long_dfs, ignore_index=True)
full_long_df.columns = full_long_df.columns.str.lower()
full_long_df.columns
Index(['source_page', 'party', 'year', 'stattype', 'value'], dtype='object')
full_long_df
source_page | party | year | stattype | value | |
---|---|---|---|---|---|
0 | NaN | NaN | 1946 | % | % |
1 | Hessen | Wahlbeteiligung | 1946 | % | 73,2 |
2 | Hessen | CDU | 1946 | % | 30,9 |
3 | Hessen | SPD | 1946 | % | 42,7 |
4 | Hessen | FDP | 1946 | % | 15,7 |
... | ... | ... | ... | ... | ... |
8243 | Schleswig-Holstein | DP³ | 2022 | Sitze | – |
8244 | Schleswig-Holstein | KPD/DKP¹ | 2022 | Sitze | – |
8245 | Schleswig-Holstein | REP | 2022 | Sitze | – |
8246 | Schleswig-Holstein | DVU | 2022 | Sitze | – |
8247 | Schleswig-Holstein | Sonstige | 2022 | Sitze | – |
8248 rows × 5 columns
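The loop above does two things per CSV: it renames each pair of year columns to YEAR_% / YEAR_Sitze, then melts the frame from wide to long. A minimal sketch of that wide-to-long step on a one-row frame built from the Hessen 1946 values shown above:
wide = pd.DataFrame({
    "Source_Page": ["Hessen"],
    "Table_Index": [0],
    "Party": ["SPD"],
    "1946_%": ["42,7"],
    "1946_Sitze": ["38"],
})
tidy = pd.melt(wide, id_vars=["Source_Page", "Table_Index", "Party"],
               var_name="Year_Stat", value_name="Value")
tidy[["Year", "StatType"]] = tidy["Year_Stat"].str.split("_", expand=True)
tidy  # one row per (party, year, stat) combination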
"source_page"].isna()] full_long_df[full_long_df[
source_page | party | year | stattype | value | |
---|---|---|---|---|---|
0 | NaN | NaN | 1946 | % | % |
14 | NaN | NaN | 1946 | Sitze | Sitze |
28 | NaN | NaN | 1950 | % | % |
42 | NaN | NaN | 1950 | Sitze | Sitze |
56 | NaN | NaN | 1954 | % | % |
... | ... | ... | ... | ... | ... |
8163 | NaN | NaN | 2012 | Sitze | Sitze |
8180 | NaN | NaN | 2017 | % | % |
8197 | NaN | NaN | 2017 | Sitze | Sitze |
8214 | NaN | NaN | 2022 | % | % |
8231 | NaN | NaN | 2022 | Sitze | Sitze |
504 rows × 5 columns
cleaned_full_long_df = full_long_df.dropna(subset=["source_page"]).reset_index(drop=True)
cleaned_full_long_df
source_page | party | year | stattype | value | |
---|---|---|---|---|---|
0 | Hessen | Wahlbeteiligung | 1946 | % | 73,2 |
1 | Hessen | CDU | 1946 | % | 30,9 |
2 | Hessen | SPD | 1946 | % | 42,7 |
3 | Hessen | FDP | 1946 | % | 15,7 |
4 | Hessen | GRÜNE | 1946 | % | – |
... | ... | ... | ... | ... | ... |
7739 | Schleswig-Holstein | DP³ | 2022 | Sitze | – |
7740 | Schleswig-Holstein | KPD/DKP¹ | 2022 | Sitze | – |
7741 | Schleswig-Holstein | REP | 2022 | Sitze | – |
7742 | Schleswig-Holstein | DVU | 2022 | Sitze | – |
7743 | Schleswig-Holstein | Sonstige | 2022 | Sitze | – |
7744 rows × 5 columns
cleaned_full_long_df = cleaned_full_long_df.rename(columns={'source_page': 'state'})
cleaned_full_long_df
state | party | year | stattype | value | |
---|---|---|---|---|---|
0 | Hessen | Wahlbeteiligung | 1946 | % | 73,2 |
1 | Hessen | CDU | 1946 | % | 30,9 |
2 | Hessen | SPD | 1946 | % | 42,7 |
3 | Hessen | FDP | 1946 | % | 15,7 |
4 | Hessen | GRÜNE | 1946 | % | – |
... | ... | ... | ... | ... | ... |
7739 | Schleswig-Holstein | DP³ | 2022 | Sitze | – |
7740 | Schleswig-Holstein | KPD/DKP¹ | 2022 | Sitze | – |
7741 | Schleswig-Holstein | REP | 2022 | Sitze | – |
7742 | Schleswig-Holstein | DVU | 2022 | Sitze | – |
7743 | Schleswig-Holstein | Sonstige | 2022 | Sitze | – |
7744 rows × 5 columns
# Create a mask for rows where stattype is '%'
mask = cleaned_full_long_df['stattype'] == '%'

# Replace comma with period and convert to numeric
cleaned_full_long_df.loc[mask, 'value'] = cleaned_full_long_df.loc[mask, 'value'].str.replace(',', '.')
cleaned_full_long_df.loc[mask, 'value'] = pd.to_numeric(cleaned_full_long_df.loc[mask, 'value'], errors='coerce') / 100
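errors='coerce' matters here because the scraped values use the dash '–' for parties that did not run; those entries become NaN instead of raising an error. A quick illustration on a throwaway Series with values from the table above:
pd.to_numeric(pd.Series(["73,2", "–", "30,9"]).str.replace(",", "."), errors="coerce") / 100
# -> 0.732, NaN, 0.309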
# Filter for voter turnout rows
turnout_df = cleaned_full_long_df[cleaned_full_long_df['party'] == 'Wahlbeteiligung'].copy()

# Ensure the 'year' column is numeric so that sorting works correctly.
turnout_df['year'] = pd.to_numeric(turnout_df['year'], errors='coerce')

# Pivot the data so that each (state, year) row gets both "%" and "Sitze" values
turnout_table = turnout_df.pivot_table(
    index=["state", "year"],
    columns="stattype",
    values="value",
    aggfunc="first"  # if there are duplicates, you can use "mean" or another aggregator
).reset_index()

# Remove the name for the columns index and rename the turnout columns for clarity
turnout_table.columns.name = None
turnout_table = turnout_table.rename(columns={"%": "Turnout_Percentage", "Sitze": "Turnout_Sitze"})

# Optionally, sort the table by state and year
turnout_table = turnout_table.sort_values(by=["state", "year"])

turnout_table.head()
state | year | Turnout_Percentage | Turnout_Sitze | |
---|---|---|---|---|
0 | Baden-Württemberg | 1952 | 0.637 | 121 |
1 | Baden-Württemberg | 1956 | 0.703 | 120 |
2 | Baden-Württemberg | 1960 | 0.59 | 121 |
3 | Baden-Württemberg | 1964 | 0.677 | 120 |
4 | Baden-Württemberg | 1968 | 0.7 | 127 |
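aggfunc="first" silently keeps one value if a (state, year, stattype) combination appears more than once, which could happen if the same election was scraped from two tables. A quick sanity-check sketch (not part of the original pipeline) to confirm the pivot is not hiding anything:
dupes = turnout_df.duplicated(subset=["state", "year", "stattype"], keep=False)
turnout_df[dupes]  # an empty result means every (state, year, stattype) is unique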
cleaned_full_long_df = cleaned_full_long_df[cleaned_full_long_df["party"] != "Wahlbeteiligung"].reset_index(drop=True)
cleaned_full_long_df
state | party | year | stattype | value | |
---|---|---|---|---|---|
0 | Hessen | CDU | 1946 | % | 0.309 |
1 | Hessen | SPD | 1946 | % | 0.427 |
2 | Hessen | FDP | 1946 | % | 0.157 |
3 | Hessen | GRÜNE | 1946 | % | NaN |
4 | Hessen | DIE LINKE | 1946 | % | NaN |
... | ... | ... | ... | ... | ... |
7515 | Schleswig-Holstein | DP³ | 2022 | Sitze | – |
7516 | Schleswig-Holstein | KPD/DKP¹ | 2022 | Sitze | – |
7517 | Schleswig-Holstein | REP | 2022 | Sitze | – |
7518 | Schleswig-Holstein | DVU | 2022 | Sitze | – |
7519 | Schleswig-Holstein | Sonstige | 2022 | Sitze | – |
7520 rows × 5 columns
"party"]=="Wahlbeteiligung"] cleaned_full_long_df[cleaned_full_long_df[
state | party | year | stattype | value |
---|
# cleaned_full_long_df.to_csv("test.csv", index=False)
percent_votes = cleaned_full_long_df[cleaned_full_long_df["stattype"] == "%"].reset_index(drop=True)

percent_votes
# percent_votes.to_csv("percentage_votes.csv", index=False)
state | party | year | stattype | value | |
---|---|---|---|---|---|
0 | Hessen | CDU | 1946 | % | 0.309 |
1 | Hessen | SPD | 1946 | % | 0.427 |
2 | Hessen | FDP | 1946 | % | 0.157 |
3 | Hessen | GRÜNE | 1946 | % | NaN |
4 | Hessen | DIE LINKE | 1946 | % | NaN |
... | ... | ... | ... | ... | ... |
3755 | Schleswig-Holstein | DP³ | 2022 | % | NaN |
3756 | Schleswig-Holstein | KPD/DKP¹ | 2022 | % | NaN |
3757 | Schleswig-Holstein | REP | 2022 | % | NaN |
3758 | Schleswig-Holstein | DVU | 2022 | % | NaN |
3759 | Schleswig-Holstein | Sonstige | 2022 | % | 0.038 |
3760 rows × 5 columns
seats_won = cleaned_full_long_df[cleaned_full_long_df["stattype"] == "Sitze"].reset_index(drop=True)

seats_won
# seats_won.to_csv("seats_won.csv", index=False)
state | party | year | stattype | value | |
---|---|---|---|---|---|
0 | Hessen | CDU | 1946 | Sitze | 28 |
1 | Hessen | SPD | 1946 | Sitze | 38 |
2 | Hessen | FDP | 1946 | Sitze | 14 |
3 | Hessen | GRÜNE | 1946 | Sitze | – |
4 | Hessen | DIE LINKE | 1946 | Sitze | – |
... | ... | ... | ... | ... | ... |
3755 | Schleswig-Holstein | DP³ | 2022 | Sitze | – |
3756 | Schleswig-Holstein | KPD/DKP¹ | 2022 | Sitze | – |
3757 | Schleswig-Holstein | REP | 2022 | Sitze | – |
3758 | Schleswig-Holstein | DVU | 2022 | Sitze | – |
3759 | Schleswig-Holstein | Sonstige | 2022 | Sitze | – |
3760 rows × 5 columns
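Unlike the vote shares converted earlier, the seat counts in seats_won are still strings with '–' placeholders. If numeric seats are needed, the same errors='coerce' pattern applies; a sketch, not part of the original pipeline:
seats_won["value"] = pd.to_numeric(seats_won["value"], errors="coerce")
seats_won.dropna(subset=["value"]).head()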