pages/bin/update_scholar_citations.py
Dominik Fuchß 1f349ffda2
Adapt mechanism for citation count (#3177)
## Summary

This PR introduces an automated mechanism to update the citation count
for authors' publications.

- Inspired by @BernardoCama’s suggestion in #3150.
- Resolves #3150.

## Key Changes

- Adds an action to update publication citation counts.
    - Note: This action creates a commit on the main branch.
- To trigger further GitHub Actions workflows from this commit, a
Personal Access Token (PAT) must be used (the default GitHub Actions
token cannot trigger subsequent workflows).
- Adds and manages citation data in `_data/citations.yml`.
- Adds and adapts `bin/update_scholar_citations.py` to handle citation
updates.

## Usage Examples
### Timeout
<img width="758" height="415" alt="image"
src="https://github.com/user-attachments/assets/0a330d35-b386-4670-8668-62701f2dc68b"
/>

### Success

<img width="1684" height="857" alt="image"
src="https://github.com/user-attachments/assets/44aa0558-e02a-4f00-b8cb-9e0ce16dd53c"
/>
2025-11-16 22:17:17 -03:00

133 lines
4.6 KiB
Python

#!/usr/bin/env python
import os
import sys
import yaml
from datetime import datetime
from scholarly import scholarly
def load_scholar_user_id() -> str:
    """Load the Google Scholar user ID from the configuration file.

    Reads ``_data/socials.yml`` (relative to the current working directory)
    and returns the value stored under its ``scholar_userid`` key.

    Returns:
        The configured Google Scholar user ID.

    Raises:
        SystemExit: With exit code 1 when the file is missing, cannot be
            parsed as YAML, or does not provide a non-empty
            ``scholar_userid`` entry.
    """
    config_file = "_data/socials.yml"
    if not os.path.exists(config_file):
        print(
            f"Configuration file {config_file} not found. Please ensure the file exists and contains your Google Scholar user ID."
        )
        sys.exit(1)

    # Keep the try body limited to the statements that can raise a YAML error.
    try:
        with open(config_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(
            f"Error parsing YAML file {config_file}: {e}. Please check the file for correct YAML syntax."
        )
        sys.exit(1)

    # An empty (or fully commented-out) file loads as None, and a file whose
    # top level is a list or scalar has no ``.get``; treat both the same as a
    # missing key instead of crashing with an AttributeError.
    scholar_user_id = (
        config.get("scholar_userid") if isinstance(config, dict) else None
    )
    if not scholar_user_id:
        print(
            "No 'scholar_userid' found in the configuration file. Please add 'scholar_userid' to _data/socials.yml."
        )
        sys.exit(1)
    return scholar_user_id
# Resolved once when the module is loaded; load_scholar_user_id() terminates
# the process with exit code 1 if the ID cannot be read from the config file.
SCHOLAR_USER_ID: str = load_scholar_user_id()
# Path of the YAML file that the fetched citation data is written to.
OUTPUT_FILE: str = "_data/citations.yml"
def get_scholar_citations() -> None:
    """Fetch and update Google Scholar citation data.

    Looks up the author identified by ``SCHOLAR_USER_ID``, collects title,
    year and citation count for each publication, and writes the result to
    ``OUTPUT_FILE`` as YAML.

    The fetch is skipped when ``OUTPUT_FILE`` already records today's date
    under ``metadata.last_updated``. The file is left untouched when the
    freshly fetched ``papers`` mapping equals the stored one.

    Raises:
        SystemExit: With exit code 1 when the author data cannot be fetched,
            contains no ``publications`` entry, or the output file cannot be
            written.
    """
    print(f"Fetching citations for Google Scholar ID: {SCHOLAR_USER_ID}")
    today = datetime.now().strftime("%Y-%m-%d")

    # Bind this on every path: the comparison before writing reads it even
    # when the output file does not exist yet (first run) or is unreadable.
    existing_data = None

    # Check if the output file was already updated today
    if os.path.exists(OUTPUT_FILE):
        try:
            with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
            # Only a mapping (or an empty file) is usable; anything else is
            # reported through the warning below and then ignored.
            if loaded is not None and not isinstance(loaded, dict):
                raise ValueError("top-level YAML value is not a mapping")
            existing_data = loaded

            metadata = (existing_data or {}).get("metadata")
            if isinstance(metadata, dict) and "last_updated" in metadata:
                last_updated = metadata["last_updated"]
                print(f"Last updated on: {last_updated}")
                # An unquoted YAML date loads as datetime.date, so compare
                # its string form against today's ISO date.
                if str(last_updated) == today:
                    print("Citations data is already up-to-date. Skipping fetch.")
                    return
        except Exception as e:
            # Best effort: a broken cache file must not prevent a fresh fetch.
            print(
                f"Warning: Could not read existing citation data from {OUTPUT_FILE}: {e}. The file may be missing or corrupted."
            )

    citation_data = {"metadata": {"last_updated": today}, "papers": {}}

    # Configure scholarly's request timeout and retry count before fetching.
    scholarly.set_timeout(15)
    scholarly.set_retries(3)
    try:
        author = scholarly.search_author_id(SCHOLAR_USER_ID)
        author_data = scholarly.fill(author)
    except Exception as e:
        print(
            f"Error fetching author data from Google Scholar for user ID '{SCHOLAR_USER_ID}': {e}. Please check your internet connection and Scholar user ID."
        )
        sys.exit(1)

    if not author_data:
        print(
            f"Could not fetch author data for user ID '{SCHOLAR_USER_ID}'. Please verify the Scholar user ID and try again."
        )
        sys.exit(1)
    if "publications" not in author_data:
        print(f"No publications found in author data for user ID '{SCHOLAR_USER_ID}'.")
        sys.exit(1)

    for pub in author_data["publications"]:
        # Resolved before anything else so that the error handler below can
        # report a title without touching ``pub`` again (and failing itself).
        label = "Unknown"
        try:
            bib = pub.get("bib") or {}
            label = bib.get("title", "Unknown")

            pub_id = pub.get("pub_id") or pub.get("author_pub_id")
            if not pub_id:
                print(
                    f"Warning: No ID found for publication: {label}. This publication will be skipped."
                )
                continue

            title = bib.get("title", "Unknown Title")
            year = bib.get("pub_year", "Unknown Year")
            citations = pub.get("num_citations", 0)
            print(f"Found: {title} ({year}) - Citations: {citations}")
            citation_data["papers"][pub_id] = {
                "title": title,
                "year": year,
                "citations": citations,
            }
        except Exception as e:
            print(
                f"Error processing publication '{label}': {e}. This publication will be skipped."
            )

    # Compare new data with existing data; skipping the write keeps the
    # stored ``last_updated`` unchanged when no citation count moved.
    if existing_data and existing_data.get("papers") == citation_data["papers"]:
        print("No changes in citation data. Skipping file update.")
        return

    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            yaml.dump(citation_data, f, width=1000, sort_keys=True)
        print(f"Citation data saved to {OUTPUT_FILE}")
    except Exception as e:
        print(
            f"Error writing citation data to {OUTPUT_FILE}: {e}. Please check file permissions and disk space."
        )
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point: any exception that escapes the update routine is
    # reported on stdout and converted into a non-zero exit status.
    try:
        get_scholar_citations()
    except Exception as err:
        print(f"Unexpected error: {err}")
        raise SystemExit(1)