Data Extraction / Scraping

import pandas as pd
df = pd.read_csv("federal_cty_harm.csv")
df.head()
county_code election_year state eligible_voters number_voters valid_votes invalid_votes turnout cdu csu ... volt zentrum area population flag_unsuccessful_naive_merge total_votes cdu_csu far_right far_left far_left_w_linke
0 1001 1990 1 69563 50485 50036 449 0.725745 0.370533 0.0 ... 0.0 0.0 56.36 86.6 NaN 50036 0.370533 NaN NaN NaN
1 1001 1994 1 68987 52379 51823 556 0.759259 0.344191 0.0 ... 0.0 0.0 56.44 87.9 NaN 51823 0.344191 NaN NaN NaN
2 1001 1998 1 65755 50560 49862 698 0.768915 0.291565 0.0 ... 0.0 0.0 56.44 84.7 NaN 49862 0.291565 NaN NaN NaN
3 1001 2002 1 65740 49054 48553 501 0.746182 0.295800 0.0 ... 0.0 0.0 56.38 84.7 NaN 48553 0.295800 NaN NaN NaN
4 1001 2005 1 66970 49002 48235 767 0.731701 0.288380 0.0 ... 0.0 0.0 56.38 86.1 NaN 48235 0.288380 NaN NaN NaN

5 rows × 120 columns

df.dtypes
county_code           int64
election_year         int64
state                 int64
eligible_voters       int64
number_voters         int64
                     ...   
total_votes           int64
cdu_csu             float64
far_right           float64
far_left            float64
far_left_w_linke    float64
Length: 120, dtype: object
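The dtypes look right for the harmonized county data, which makes a quick consistency check cheap. A sketch, assuming turnout was derived as number_voters / eligible_voters (column names taken from df.head() above):

recomputed = df["number_voters"] / df["eligible_voters"]
print((recomputed - df["turnout"]).abs().max())  # ~0 if the columns agree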
# import requests
# from bs4 import BeautifulSoup

# # Fetch the raw HTML of the Hamburg results page and take a first look
# elections_data = requests.get("https://www.wahlrecht.de/ergebnisse/hamburg.htm").text
# print(elections_data)
# soup = BeautifulSoup(elections_data, "html.parser")  # explicit parser avoids environment-dependent defaults
# soup.head.title

# # All tables on the page, then narrow down to the results tables (class="border")
# tables = soup.select("table")
# len(tables)
# tables[1]
# border_tables = soup.select("table.border")
# results_table = border_tables[0]
# results_table.text[:200]

# # Rows and cells of the results table
# row_els = results_table.select("tbody tr")
# len(row_els)
# row_els[1]
# [cell.text for cell in row_els[0].select("td")]
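The inspected rows convert to a quick-and-dirty frame in one comprehension (a sketch, kept commented like the cell above; proper header handling is deferred to the functions below):

# data = [[cell.text for cell in tr.select("td")] for tr in row_els]
# pd.DataFrame(data).head()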
# import nest_asyncio
# import asyncio
# from playwright.async_api import async_playwright
# import pandas as pd
# from bs4 import BeautifulSoup
# # Enable async in Jupyter
# nest_asyncio.apply()

# BASE_URL = "https://www.wahlrecht.de/ergebnisse/index.htm"
# HOST_URL = "https://www.wahlrecht.de/ergebnisse/"

# async def get_links_from_base():
#     """Fetch the base page and extract all links from the second table with class='border'."""
#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=True)
#         page = await browser.new_page()
#         await page.goto(BASE_URL)
        
#         # Get the HTML of the base page
#         base_content = await page.content()
#         soup = BeautifulSoup(base_content, "html.parser")
        
#         # Find all tables with class='border'
#         tables = soup.find_all("table", class_="border")
        
#         if len(tables) < 2:
#             print("Could not find the second table with class='border'.")
#             await browser.close()
#             return []
        
#         # Get the second table
#         second_table = tables[1]
        
#         # In the second table, find the FIRST <tbody>
#         tbody = second_table.find("tbody")
#         if not tbody:
#             print("No <tbody> found in the second table.")
#             await browser.close()
#             return []
        
#         # Extract links from the rows
#         links = []
#         for tr in tbody.find_all("tr"):
#             # Check each cell for <a> tags
#             for td in tr.find_all("td"):
#                 a_tag = td.find("a", href=True)
#                 if a_tag:
#                     relative_href = a_tag["href"]
#                     # Convert relative URL to absolute
#                     full_url = HOST_URL + relative_href
#                     link_text = a_tag.get_text(strip=True)
#                     links.append((link_text, full_url))
        
#         await browser.close()
#         return links
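# With the cell above uncommented, the link extraction can be smoke-tested on its own before wiring up the full pipeline (a sketch; top-level await works here because nest_asyncio.apply() has patched the running loop):

# links = await get_links_from_base()
# print(len(links), links[:3])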
# def parse_complex_table(table_soup, source_name, table_index):
#     """
#     Parse a table with a two-row <thead> structure that represents election results,
#     building a MultiIndex header as follows:
    
#     Expected <thead> structure (e.g., on thueringen.htm):
#       <thead>
#         <tr>
#           <th rowspan="2"></th>         <!-- empty cell becomes "Party" -->
#           <th class="jahr" colspan="2">1990</th>
#           <th class="jahr" colspan="2">1994</th>
#           ... etc.
#         </tr>
#         <tr>
#           <th>%</th>
#           <th>Sitze</th>
#           <th>%</th>
#           <th>Sitze</th>
#           ... etc.
#         </tr>
#       </thead>
      
#     The final MultiIndex for columns will be:
#       [("Party", ""), ("1990", "%"), ("1990", "Sitze"), ("1994", "%"), ("1994", "Sitze"), …]
      
#     The data rows may be split across multiple <tbody> tags.
#     Returns a DataFrame with MultiIndex columns or None if the structure is not as expected.
#     """
#     # ----- Parse the header -----
#     thead = table_soup.find("thead")
#     if not thead:
#         return None  # No <thead> found; skip this table

#     header_rows = thead.find_all("tr")
#     if len(header_rows) != 2:
#         return None  # Expect exactly two header rows

#     # Row 0: Top-level headers
#     row0_cells = header_rows[0].find_all("th")
#     if not row0_cells or len(row0_cells) < 2:
#         return None
#     party_header = row0_cells[0].get_text(strip=True) or "Party"
#     top_headers = [party_header]
#     for cell in row0_cells[1:]:
#         try:
#             colspan = int(cell.get("colspan", "1"))
#         except ValueError:
#             colspan = 1
#         text = cell.get_text(strip=True)
#         for _ in range(colspan):
#             top_headers.append(text)

#     # Row 1: Sub-headers (lower-level headers)
#     row1_cells = header_rows[1].find_all("th")
#     if not row1_cells:
#         return None
#     bottom_headers = [""]
#     for cell in row1_cells:
#         bottom_headers.append(cell.get_text(strip=True))

#     if len(top_headers) != len(bottom_headers):
#         return None

#     # Create the MultiIndex columns
#     columns = pd.MultiIndex.from_arrays([top_headers, bottom_headers])

#     # ----- Parse the data from all <tbody> tags -----
#     tbodys = table_soup.find_all("tbody")
#     if not tbodys:
#         return None

#     data_rows = []
#     for tbody in tbodys:
#         for tr in tbody.find_all("tr"):
#             cells = tr.find_all(["td", "th"])
#             row_values = [cell.get_text(strip=True) for cell in cells]
#             if row_values:
#                 data_rows.append(row_values)

#     # Only keep rows that have the expected number of columns.
#     expected_cols = len(columns)
#     valid_rows = [row for row in data_rows if len(row) == expected_cols]
#     if not valid_rows:
#         return None

#     df = pd.DataFrame(valid_rows, columns=columns)
#     df.insert(0, "Table_Index", table_index)
#     df.insert(0, "Source_Page", source_name)
#     return df
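The header expansion is the subtle part of parse_complex_table; stripped of the HTML parsing, it is just two parallel label lists fed to pd.MultiIndex.from_arrays. A self-contained sketch with made-up years:

import pandas as pd

top = ["Party", "1990", "1990", "1994", "1994"]   # year repeated once per colspan
bottom = ["", "%", "Sitze", "%", "Sitze"]         # sub-headers from the second <tr>
cols = pd.MultiIndex.from_arrays([top, bottom])
print(list(cols))
# [('Party', ''), ('1990', '%'), ('1990', 'Sitze'), ('1994', '%'), ('1994', 'Sitze')]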
# async def scrape_page_tables(url, name):
#     """
#     Scrape all <table> tags from a given URL.
#     We'll try parse_complex_table() first.
#     If that fails (None), we'll fallback to the naive approach.
#     """
#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=True)
#         page = await browser.new_page()
#         await page.goto(url)
        
#         # Get page HTML
#         html = await page.content()
#         soup = BeautifulSoup(html, "html.parser")
        
#         # Find all <table>
#         tables = soup.find_all("table")
        
#         dfs = []
#         for i, table in enumerate(tables):
#             # Try complex header parsing
#             df_complex = parse_complex_table(table, name, i)  # <-- uses the new version
#             if df_complex is not None:
#                 # If we successfully parsed a complex header, use that
#                 dfs.append(df_complex)
#             else:
#                 # Fallback: naive approach
#                 rows = table.find_all("tr")
#                 data = []
#                 # Attempt to detect headers in the first row if it has <th>
#                 if rows and rows[0].find("th"):
#                     headers = [th.get_text(strip=True) for th in rows[0].find_all("th")]
#                     data_start_idx = 1
#                 else:
#                     headers = None
#                     data_start_idx = 0

#                 # Extract rows
#                 for row in rows[data_start_idx:]:
#                     cols = [td.get_text(strip=True) for td in row.find_all("td")]
#                     if cols:
#                         data.append(cols)

#                 max_cols = max((len(r) for r in data), default=0)
#                 if headers and len(headers) != max_cols:
#                     headers = None

#                 if data:
#                     df = pd.DataFrame(data, columns=headers if headers else None)
#                     df.insert(0, "Table_Index", i)
#                     df.insert(0, "Source_Page", name)
#                     dfs.append(df)

#         await browser.close()
#         return dfs
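As a cross-check on the custom parser, pandas' built-in HTML reader also builds MultiIndex columns from colspan/rowspan headers (a sketch, commented like the surrounding cells; it requires lxml or html5lib, and its column labels will not match parse_complex_table's exactly):

# tables = pd.read_html(html)  # html as captured via page.content() above
# print(len(tables), tables[0].columns)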

# async def main():
#     """Main flow: get links, then scrape each page's tables."""
#     links = await get_links_from_base()
#     if not links:
#         print("No links found from the base page.")
#         return
    
#     print(f"Found {len(links)} links from base page.")
    
#     all_results = {}
#     for link_text, link_url in links:
#         print(f"\nScraping: {link_text} -> {link_url}")
#         page_dfs = await scrape_page_tables(link_url, link_text)
#         if page_dfs:
#             all_results[link_text] = page_dfs

#     # Save/Display only the tables that meet our column threshold
#     min_columns = 6  # change this number as appropriate for your complex tables
#     for state_name, dfs in all_results.items():
#         for idx, df in enumerate(dfs):
#             if df.shape[1] < min_columns:
#                 # Skip tables that are too "simple"
#                 continue

#             print(f"\nDataFrame from {state_name} (Table {idx}):")
#             display(df.head())
#             csv_name = f"{state_name.replace(' ', '_')}_table_{idx}.csv"
#             df.to_csv(csv_name, index=False)
#             print(f"Saved: {csv_name}")

# # Run in Jupyter
# await main()
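# Outside Jupyter the same entry point runs without nest_asyncio; a script would end with (a sketch):

# if __name__ == "__main__":
#     asyncio.run(main())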

Data Transformation & Cleaning

import glob
# Path to your CSV folder
csv_folder = "state_election_results/*.csv"

# Find all CSV files
files = glob.glob(csv_folder)

all_long_dfs = []

for file in files:
    df = pd.read_csv(file)
    
    # The first 3 columns are "Source_Page", "Table_Index", "Party"
    fixed_cols = df.columns[:3]  # e.g. ['Source_Page', 'Table_Index', 'Party']
    # The rest are the repeated year columns
    repeated_years = df.columns[3:]
    
    # Rename the repeated columns in pairs: 
    # e.g. "1952", "1952" -> "1952_%", "1952_Sitze"
    new_cols = list(fixed_cols)  # start with the first 3 columns as-is
    
    # Make sure we have an even number of columns in repeated_years
    # If it's odd, there's a leftover column you might need to handle or skip.
    if len(repeated_years) % 2 != 0:
        print(f"Warning: {file} has an odd number of repeated columns. Check manually!")
    
    # Build new column names in pairs
    for i in range(0, len(repeated_years), 2):
        year_col_1 = repeated_years[i]
        # The second column in the pair (missing if the count is odd):
        year_col_2 = repeated_years[i + 1] if i + 1 < len(repeated_years) else None
        
        # Both columns of a pair carry the same year label; pd.read_csv
        # deduplicates repeated headers (e.g. "1952", "1952.1"), so take
        # the year from the first column of the pair.
        year_label = year_col_1
        
        # Create the new names; only add "_Sitze" when the pair is complete,
        # otherwise len(new_cols) would exceed the frame's width and the
        # df.columns assignment below would raise.
        new_cols.append(f"{year_label}_%")
        if year_col_2 is not None:
            new_cols.append(f"{year_label}_Sitze")
    
    # Assign the new column names to df
    df.columns = new_cols
    
    # Now "df" has columns like:
    # Source_Page, Table_Index, Party, 1952_%, 1952_Sitze, 1956_%, 1956_Sitze, ...
    
    # Melt (unpivot) from wide to long
    # id_vars = the columns we don't want to melt
    id_vars = ["Source_Page", "Table_Index", "Party"]
    value_vars = df.columns[len(id_vars):]  # all the year_% and year_Sitze columns
    
    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="Year_Stat",
        value_name="Value"
    )
    
    # "Year_Stat" might look like "1952_%", "1952_Sitze"
    # Split into "Year" and "StatType"
    long_df[["Year", "StatType"]] = long_df["Year_Stat"].str.split("_", expand=True)
    
    # Keep only the relevant columns
    long_df = long_df[["Source_Page", "Party", "Year", "StatType", "Value"]]
    
    # Append to list
    all_long_dfs.append(long_df)
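The wide-to-long step is easiest to see on a toy frame (a minimal sketch; the column names mirror the renamed pairs above, and the values are made up):

toy = pd.DataFrame({
    "Source_Page": ["Hessen"], "Table_Index": [0], "Party": ["SPD"],
    "1952_%": ["42,7"], "1952_Sitze": ["38"],
})
long_toy = pd.melt(toy, id_vars=["Source_Page", "Table_Index", "Party"],
                   var_name="Year_Stat", value_name="Value")
long_toy[["Year", "StatType"]] = long_toy["Year_Stat"].str.split("_", expand=True)
print(long_toy[["Party", "Year", "StatType", "Value"]])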
# Concatenate all long dataframes
full_long_df = pd.concat(all_long_dfs, ignore_index=True)
full_long_df.columns = full_long_df.columns.str.lower()
full_long_df.columns
Index(['source_page', 'party', 'year', 'stattype', 'value'], dtype='object')
full_long_df
source_page party year stattype value
0 NaN NaN 1946 % %
1 Hessen Wahlbe­teiligung 1946 % 73,2
2 Hessen CDU 1946 % 30,9
3 Hessen SPD 1946 % 42,7
4 Hessen FDP 1946 % 15,7
... ... ... ... ... ...
8243 Schleswig-Holstein DP³ 2022 Sitze
8244 Schleswig-Holstein KPD/DKP¹ 2022 Sitze
8245 Schleswig-Holstein REP 2022 Sitze
8246 Schleswig-Holstein DVU 2022 Sitze
8247 Schleswig-Holstein Sonstige 2022 Sitze

8248 rows × 5 columns

full_long_df[full_long_df["source_page"].isna()]
source_page party year stattype value
0 NaN NaN 1946 % %
14 NaN NaN 1946 Sitze Sitze
28 NaN NaN 1950 % %
42 NaN NaN 1950 Sitze Sitze
56 NaN NaN 1954 % %
... ... ... ... ... ...
8163 NaN NaN 2012 Sitze Sitze
8180 NaN NaN 2017 % %
8197 NaN NaN 2017 Sitze Sitze
8214 NaN NaN 2022 % %
8231 NaN NaN 2022 Sitze Sitze

504 rows × 5 columns

cleaned_full_long_df = full_long_df.dropna(subset=["source_page"]).reset_index(drop=True)
cleaned_full_long_df
source_page party year stattype value
0 Hessen Wahlbe­teiligung 1946 % 73,2
1 Hessen CDU 1946 % 30,9
2 Hessen SPD 1946 % 42,7
3 Hessen FDP 1946 % 15,7
4 Hessen GRÜNE 1946 %
... ... ... ... ... ...
7739 Schleswig-Holstein DP³ 2022 Sitze
7740 Schleswig-Holstein KPD/DKP¹ 2022 Sitze
7741 Schleswig-Holstein REP 2022 Sitze
7742 Schleswig-Holstein DVU 2022 Sitze
7743 Schleswig-Holstein Sonstige 2022 Sitze

7744 rows × 5 columns

cleaned_full_long_df = cleaned_full_long_df.rename(columns={'source_page': 'state'})
cleaned_full_long_df
state party year stattype value
0 Hessen Wahlbe­teiligung 1946 % 73,2
1 Hessen CDU 1946 % 30,9
2 Hessen SPD 1946 % 42,7
3 Hessen FDP 1946 % 15,7
4 Hessen GRÜNE 1946 %
... ... ... ... ... ...
7739 Schleswig-Holstein DP³ 2022 Sitze
7740 Schleswig-Holstein KPD/DKP¹ 2022 Sitze
7741 Schleswig-Holstein REP 2022 Sitze
7742 Schleswig-Holstein DVU 2022 Sitze
7743 Schleswig-Holstein Sonstige 2022 Sitze

7744 rows × 5 columns

# Create a mask for rows where stattype is '%'
mask = cleaned_full_long_df['stattype'] == '%'

# Replace comma with period and convert to numeric
cleaned_full_long_df.loc[mask, 'value'] = cleaned_full_long_df.loc[mask, 'value'].str.replace(',', '.')
cleaned_full_long_df.loc[mask, 'value'] = pd.to_numeric(cleaned_full_long_df.loc[mask, 'value'], errors='coerce') / 100
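For reference, the same conversion on a standalone Series (a sketch; German decimal commas become periods before parsing, and empty strings coerce to NaN):

s = pd.Series(["73,2", "30,9", ""])
print(pd.to_numeric(s.str.replace(",", "."), errors="coerce") / 100)  # 0.732, 0.309, NaN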
# Filter for voter turnout rows
# (the "Wahlbeteiligung" label in the scraped data carries a soft hyphen, U+00AD,
# over from the HTML, so the filter below matches that string verbatim)
turnout_df = cleaned_full_long_df[cleaned_full_long_df['party'] == 'Wahlbe­teiligung'].copy()

# Ensure the 'year' column is numeric so that sorting works correctly.
turnout_df['year'] = pd.to_numeric(turnout_df['year'], errors='coerce')

# Pivot the data so that each (state, year) row gets both "%" and "Sitze" values
turnout_table = turnout_df.pivot_table(
    index=["state", "year"],
    columns="stattype",
    values="value",
    aggfunc="first"  # if there are duplicates, you can use "mean" or another aggregator
).reset_index()

# Remove the name for the columns index and rename the turnout columns for clarity
turnout_table.columns.name = None
turnout_table = turnout_table.rename(columns={"%": "Turnout_Percentage", "Sitze": "Turnout_Sitze"})

# Optionally, sort the table by state and year
turnout_table = turnout_table.sort_values(by=["state", "year"])

turnout_table.head()
state year Turnout_Percentage Turnout_Sitze
0 Baden-Württemberg 1952 0.637 121
1 Baden-Württemberg 1956 0.703 120
2 Baden-Württemberg 1960 0.59 121
3 Baden-Württemberg 1964 0.677 120
4 Baden-Württemberg 1968 0.7 127
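The same pivot on a two-row toy frame shows what pivot_table is doing here (a sketch; values are made up):

toy = pd.DataFrame({
    "state": ["Hessen", "Hessen"], "year": [1946, 1946],
    "stattype": ["%", "Sitze"], "value": [0.732, 90],
})
print(toy.pivot_table(index=["state", "year"], columns="stattype",
                      values="value", aggfunc="first").reset_index())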
cleaned_full_long_df = cleaned_full_long_df[cleaned_full_long_df["party"] != "Wahlbe­teiligung"].reset_index(drop=True)
cleaned_full_long_df
state party year stattype value
0 Hessen CDU 1946 % 0.309
1 Hessen SPD 1946 % 0.427
2 Hessen FDP 1946 % 0.157
3 Hessen GRÜNE 1946 % NaN
4 Hessen DIE LINKE 1946 % NaN
... ... ... ... ... ...
7515 Schleswig-Holstein DP³ 2022 Sitze
7516 Schleswig-Holstein KPD/DKP¹ 2022 Sitze
7517 Schleswig-Holstein REP 2022 Sitze
7518 Schleswig-Holstein DVU 2022 Sitze
7519 Schleswig-Holstein Sonstige 2022 Sitze

7520 rows × 5 columns

cleaned_full_long_df[cleaned_full_long_df["party"]=="Wahlbe­teiligung"]
state party year stattype value
# cleaned_full_long_df.to_csv("test.csv", index = False)
percent_votes = cleaned_full_long_df[cleaned_full_long_df["stattype"]=="%"].reset_index(drop=True)
# percent_votes.to_csv("percentage_votes.csv", index=False)
percent_votes
state party year stattype value
0 Hessen CDU 1946 % 0.309
1 Hessen SPD 1946 % 0.427
2 Hessen FDP 1946 % 0.157
3 Hessen GRÜNE 1946 % NaN
4 Hessen DIE LINKE 1946 % NaN
... ... ... ... ... ...
3755 Schleswig-Holstein DP³ 2022 % NaN
3756 Schleswig-Holstein KPD/DKP¹ 2022 % NaN
3757 Schleswig-Holstein REP 2022 % NaN
3758 Schleswig-Holstein DVU 2022 % NaN
3759 Schleswig-Holstein Sonstige 2022 % 0.038

3760 rows × 5 columns

seats_won = cleaned_full_long_df[cleaned_full_long_df["stattype"]=="Sitze"].reset_index(drop=True)
# seats_won.to_csv("seats_won.csv", index=False)
seats_won
state party year stattype value
0 Hessen CDU 1946 Sitze 28
1 Hessen SPD 1946 Sitze 38
2 Hessen FDP 1946 Sitze 14
3 Hessen GRÜNE 1946 Sitze
4 Hessen DIE LINKE 1946 Sitze
... ... ... ... ... ...
3755 Schleswig-Holstein DP³ 2022 Sitze
3756 Schleswig-Holstein KPD/DKP¹ 2022 Sitze
3757 Schleswig-Holstein REP 2022 Sitze
3758 Schleswig-Holstein DVU 2022 Sitze
3759 Schleswig-Holstein Sonstige 2022 Sitze

3760 rows × 5 columns
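A natural next step is to join the two splits back into one row per party and election (a sketch; the pct/seats names are my choice, and it assumes (state, party, year) uniquely identifies a row in each split):

combined = percent_votes.drop(columns="stattype").rename(columns={"value": "pct"}).merge(
    seats_won.drop(columns="stattype").rename(columns={"value": "seats"}),
    on=["state", "party", "year"],
)
combined.head()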

Analysis