那個SSA上的AWI數據下載搞定了,用selenium browser driver

來源: 2026-01-03 22:08:37 [博客] [舊帖] [給我悄悄話] 本文已被閱讀:

大概這樣:

#########################################################
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# Download the SSA "AWI series" table with a stealth-configured Chrome session
# and load it into a pandas DataFrame indexed by the 'Year' column.

# 1. Set up Chrome options to avoid detection
options = Options()
options.add_argument("start-maximized")  # Maximize window to mimic typical user behavior
options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Exclude the automation switch
options.add_experimental_option("useAutomationExtension", False)
# You may also want to add a custom, realistic user agent string for an extra layer of customization
# options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')

# 2. Initialize the WebDriver
driver = webdriver.Chrome(options=options)

try:
    # 3. Apply selenium_stealth
    stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    # 4. Navigate to the SSA website and capture the rendered HTML
    driver.get("https://www.ssa.gov/oact/cola/awiseries.html")
    html_string = driver.page_source
finally:
    # Always shut the browser down, even when stealth setup or navigation
    # fails; otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

# 5. Locate the AWI table by its `summary` attribute
soup = BeautifulSoup(html_string, 'html.parser')
tbl = soup.find('table', attrs={'summary': "AWI series and annual changes"})
if tbl is None:
    # Without this guard, str(None) is handed to pd.read_html, which fails
    # with a generic "No tables found" ValueError. Keep the exception type
    # but say what was actually missing (page layout change or blocked request).
    raise ValueError(
        'No <table summary="AWI series and annual changes"> found in the '
        "downloaded page; the page layout may have changed or the request "
        "may have been blocked."
    )

# 6. Parse the table; keep the result in a name so it is not discarded when
#    this runs as a plain script.
awi_df = pd.read_html(StringIO(str(tbl)))[0].set_index('Year')

# Bare expression kept so an interactive session still echoes the frame.
awi_df

#########################################################