# Startup banner: prints which scraper variant is running as soon as the script starts.
print("== 実行中のスクリプト: 記事一覧タグ付3列+本文+カテゴリ取得 ==")
import csv
import time
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Settings ---
# Path to the ChromeDriver executable passed to selenium's Service.
driver_path = r"C:\tools\chromedriver\chromedriver.exe"
# When True, the script fills in the login form before scraping.
login_needed = True
# URL of the blog; opened once before the article listing is crawled.
your_blog_url = "https://yozda.exblog.jp/"
# Login ID typed into the "loginid" field of the login form.
your_login_id = "yozda"
# NOTE(review): the password is kept in plain text in this file — consider
# loading it from an environment variable or a prompt instead.
your_login_pw = "************************"
# Listing URL prefix; the page number (starting at 1) is appended to it.
BASE_URL = "https://userconf.exblog.jp/posts/index.php?page="
# Output CSV path (relative, so it is created in the current working directory).
CSV_FILENAME = "blog_articles_with_tags.csv"
DEBUG_STOP_PAGE = None # e.g. 2 processes pages up to page 2; None processes every page
# --- End of settings ---
# Logging helper
def log(msg: object) -> None:
    """Print *msg* to stdout prefixed with the current local time as ``[HH:MM:SS]``.

    Args:
        msg: Text (or any object with a string form) to print.
    """
    now = datetime.now().strftime("[%H:%M:%S]")
    print(f"{now} {msg}")
start_time = time.time()

# Header row of the output CSV. This is runtime text, not a comment.
_CSV_HEADER = ["タイトル", "リンク", "日付", "カテゴリ", "公開状態", "タグ1", "タグ2", "タグ3", "本文"]


def _write_csv_row(row, mode="a"):
    """Write a single *row* to CSV_FILENAME.

    Args:
        row: Sequence of cell values for one CSV line.
        mode: "w" to truncate the file first (used for the header),
            "a" (default) to append.

    The file is reopened for every row so that everything scraped so far is
    already on disk if the run is interrupted.
    """
    # NOTE: errors="ignore" silently drops any character Shift_JIS cannot encode.
    with open(CSV_FILENAME, mode=mode, newline="", encoding="shift_jis", errors="ignore") as f:
        csv.writer(f).writerow(row)


def _first_text(scope, css_selector, default=""):
    """Return the stripped text of the first element matching *css_selector*.

    *scope* is a driver or an element; *default* is returned when nothing
    matches. find_elements() is used so that a missing element is an ordinary
    empty result instead of an exception that has to be swallowed.
    """
    matches = scope.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else default


def _login(driver):
    """Fill in and submit the login form with the configured credentials."""
    driver.get("https://ssl2.excite.co.jp/idc/login/")
    wait = WebDriverWait(driver, 15)
    login_input = wait.until(EC.presence_of_element_located((By.NAME, "loginid")))
    login_input.send_keys(your_login_id)
    driver.find_element(By.NAME, "password").send_keys(your_login_pw)
    driver.find_element(By.CLASS_NAME, "btn_login").click()
    # Fixed pause to let the post-login navigation finish.
    time.sleep(3)


def _parse_listing_row(row):
    """Extract (title, link, date, visibility) from one listing table row.

    Returns None for rows that have no usable article link (no
    "td.headline a" element, or an empty href), so the caller can skip them.
    """
    anchors = row.find_elements(By.CSS_SELECTOR, "td.headline a")
    if not anchors:
        return None
    anchor = anchors[0]
    href = (anchor.get_attribute("href") or "").strip()
    if not href:
        return None
    # An <img> inside the title link marks the article as non-public.
    visibility = "非公開" if anchor.find_elements(By.TAG_NAME, "img") else "公開"
    return anchor.text.strip(), href, _first_text(row, "td.date"), visibility


def _scrape_article(driver, link, listing_title):
    """Open *link* in a temporary tab and return (title, category, tags, body).

    *listing_title* is used as the title when the article page has no
    "h1.post-title" element. *tags* is the list of non-empty tag texts.
    The temporary tab is always closed and focus returned to the window that
    was active on entry, even if scraping raises.
    """
    listing_window = driver.current_window_handle
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.get(link)
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
            time.sleep(0.5)
        except TimeoutException:
            # Best effort: scrape whatever has been rendered so far.
            pass

        title = _first_text(driver, "h1.post-title", default=listing_title)
        # Flatten the body onto one line: full-width spaces and newlines become spaces.
        body = _first_text(driver, "div.post-main").replace("\u3000", " ").replace("\n", " ")
        tag_texts = (
            elem.text.strip()
            for elem in driver.find_elements(By.CSS_SELECTOR, "ul.taglist-list li a")
        )
        tags = [text for text in tag_texts if text]
        # Category is read from the second link inside span.TIME.
        category = _first_text(driver, "span.TIME a:nth-of-type(2)")
        return title, category, tags, body
    finally:
        driver.close()
        driver.switch_to.window(listing_window)


# Keep Chrome's own console output to a minimum.
chrome_options = Options()
chrome_options.add_argument("--log-level=3")
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    if login_needed:
        _login(driver)

    # Open the blog once before crawling the listing.
    driver.get(your_blog_url)
    time.sleep(2)

    # Start a fresh CSV containing only the header row.
    _write_csv_row(_CSV_HEADER, mode="w")

    page = 1
    while True:
        # Check the debug limit before loading the page, so no page beyond
        # the limit is fetched.
        if DEBUG_STOP_PAGE is not None and page > DEBUG_STOP_PAGE:
            log(f"デバッグ指定: {DEBUG_STOP_PAGE}ページ直前で終了")
            break

        url = f"{BASE_URL}{page}"
        log(f"取得中: {url}")
        driver.get(url)
        time.sleep(2)

        rows = driver.find_elements(By.CSS_SELECTOR, "tbody tr")
        if not rows:
            log("記事が見つかりません。")
            break

        # Read everything needed from the listing before switching tabs, so
        # no listing element is touched after focus has moved away.
        entries = [entry for entry in map(_parse_listing_row, rows) if entry is not None]
        has_next_page = bool(driver.find_elements(By.CSS_SELECTOR, "li.next a"))

        for listing_title, link, date, visibility in entries:
            title, category, tags, body = _scrape_article(driver, link, listing_title)
            # Only the first three tags are exported; missing ones become "".
            tag1, tag2, tag3 = (tags + ["", "", ""])[:3]
            _write_csv_row([title, link, date, category, visibility, tag1, tag2, tag3, body])
            log(f"✔ 保存: {title[:30]}...")

        if not has_next_page:
            break
        page += 1
finally:
    # Always shut the browser down, even when scraping fails midway.
    driver.quit()

elapsed = time.time() - start_time
log(f"完了!経過時間: {elapsed:.1f}秒")