From 5ed0352b96b623c5096616df09d244e78e7ba944 Mon Sep 17 00:00:00 2001
From: Jarvis
Date: Tue, 23 Dec 2025 10:16:22 +0800
Subject: [PATCH] Initial setup with privacy protection

---
 .github/workflows/rss_action.yaml |  45 +++++
 get_RSS.py                        | 281 ++++++++++++++++++++++++++++++
 journals.dat                      | 108 ++++++++++++
 requirements.txt                  |   2 +
 4 files changed, 436 insertions(+)
 create mode 100644 .github/workflows/rss_action.yaml
 create mode 100644 get_RSS.py
 create mode 100644 journals.dat
 create mode 100644 requirements.txt

diff --git a/.github/workflows/rss_action.yaml b/.github/workflows/rss_action.yaml
new file mode 100644
index 0000000..5114c6d
--- /dev/null
+++ b/.github/workflows/rss_action.yaml
@@ -0,0 +1,45 @@
+name: Auto RSS Fetch
+
+on:
+  schedule:
+    - cron: '0 */6 * * *'  # Run every 6 hours
+  workflow_dispatch:  # Allow manual triggering
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    # Important: grants the job permission to write to the repository
+    permissions:
+      contents: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Run RSS Script
+        env:
+          RSS_KEYWORDS: ${{ secrets.RSS_KEYWORDS }}
+        run: python get_RSS.py
+
+      - name: Commit and Push changes
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git add filtered_feed.xml
+          # Commit and push only when the staged feed actually changed
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "Auto-update RSS feed"
+            git push
+          fi
\ No newline at end of file
diff --git a/get_RSS.py b/get_RSS.py
new file mode 100644
index 0000000..a2dfc12
--- /dev/null
+++ b/get_RSS.py
@@ -0,0 +1,281 @@
+import feedparser
+import re
+import os
+import datetime
+import time
+from rfeed import Item, Feed, Guid
+from email.utils import parsedate_to_datetime
+
+# --- Configuration ---
+OUTPUT_FILE = "filtered_feed.xml"  # Output file
+MAX_ITEMS = 200  # Maximum number of items kept in the feed (rolling window)
+JOURNALS_FILE = 'journals.dat'
+KEYWORDS_FILE = 'keywords.dat'
+# A stand-alone upper-case AND separates the required terms of a query
+AND_SEPARATOR = re.compile(r'(?<!\S)AND(?!\S)')
+# ----------------
+
+def _clean_config_lines(lines):
+    """Strip entries and drop blank lines and '#' comment lines."""
+    stripped = (line.strip() for line in lines)
+    return [line for line in stripped if line and not line.startswith('#')]
+
+def load_config(filename, env_var_name=None):
+    """Load a list of configuration entries.
+
+    The environment variable is read first (GitHub Actions secrets keep the
+    values private); the local file is the fallback for local testing.
+
+    Args:
+        filename: Path of a local file holding one entry per line.
+        env_var_name: Optional name of an environment variable holding the
+            entries, separated by newlines or, if it contains no newline, by
+            semicolons.
+
+    Returns:
+        The stripped, non-empty entries with '#' comment lines removed, or
+        an empty list when neither source provides any configuration.
+    """
+    # 1. Try the environment variable (secret) first
+    content = os.environ.get(env_var_name) if env_var_name else None
+    if content:
+        print(f"Loading config from environment variable: {env_var_name}")
+        separator = '\n' if '\n' in content else ';'
+        return _clean_config_lines(content.split(separator))
+
+    # 2. Fall back to the local file
+    if os.path.exists(filename):
+        print(f"Loading config from local file: {filename}")
+        with open(filename, 'r', encoding='utf-8') as f:
+            return _clean_config_lines(f)
+
+    print(f"Warning: No config found for {filename} or {env_var_name}")
+    return []
+
+def _utc_now():
+    """Return the current UTC time as a naive datetime."""
+    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+
+def convert_struct_time_to_datetime(struct_time):
+    """Convert a feedparser time struct into a naive UTC datetime.
+
+    feedparser normalises parsed dates to UTC, so the fields are copied
+    directly instead of round-tripping through time.mktime(), which would
+    interpret them as local time and distort values that fall into a local
+    daylight-saving gap.
+
+    Args:
+        struct_time: A time.struct_time (or 9-tuple) in UTC, or None.
+
+    Returns:
+        A naive datetime holding the UTC fields; the current UTC time when
+        struct_time is missing or does not describe a valid date.
+    """
+    if not struct_time:
+        return _utc_now()
+    try:
+        year, month, day, hour, minute, second = struct_time[:6]
+        # struct_time allows leap seconds (60-61), datetime does not
+        return datetime.datetime(year, month, day, hour, minute, min(second, 59))
+    except (TypeError, ValueError):
+        return _utc_now()
+
+def parse_rss(rss_url, retries=3):
+    """Fetch and parse an online RSS/Atom feed.
+
+    Args:
+        rss_url: Feed URL; "https://" is assumed when no scheme is given.
+        retries: Maximum number of fetch attempts.
+
+    Returns:
+        A list of entry dicts with the keys 'title', 'link', 'pub_date',
+        'summary', 'journal' and 'id'; empty when every attempt failed.
+    """
+    # feedparser does not fetch a string without a scheme (it is handled as
+    # a local path or raw content), so "feeds.aps.org/..." needs a prefix
+    if '://' not in rss_url:
+        rss_url = 'https://' + rss_url
+
+    print(f"Fetching: {rss_url}...")
+    feed = None
+    for attempt in range(1, retries + 1):
+        try:
+            feed = feedparser.parse(rss_url)
+        except Exception as e:
+            print(f"Error parsing {rss_url}: {e}")
+            feed = None
+        else:
+            # feedparser reports most download/parse failures through the
+            # bozo flag instead of raising, so check it to make retries work
+            if feed.entries or not feed.get('bozo'):
+                break
+            print(f"Error parsing {rss_url}: {feed.get('bozo_exception')}")
+            feed = None
+        if attempt < retries:
+            time.sleep(2)
+    if feed is None:
+        return []
+
+    journal_title = feed.feed.get('title', 'Unknown Journal')
+    entries = []
+    for entry in feed.entries:
+        # Prefer the publication date, fall back to the update date
+        pub_struct = entry.get('published_parsed', entry.get('updated_parsed'))
+        entries.append({
+            'title': entry.get('title', ''),
+            'link': entry.get('link', ''),
+            'pub_date': convert_struct_time_to_datetime(pub_struct),
+            'summary': entry.get('summary', entry.get('description', '')),
+            'journal': journal_title,
+            'id': entry.get('id', entry.get('link', ''))  # ID used for de-duplication
+        })
+    return entries
+
+def get_existing_items():
+    """Load the items of the previously generated feed to keep history.
+
+    Returns:
+        A list of entry dicts shaped like those of parse_rss() plus
+        'is_old': True; empty when OUTPUT_FILE is missing or unreadable.
+    """
+    if not os.path.exists(OUTPUT_FILE):
+        return []
+
+    print(f"Loading existing items from {OUTPUT_FILE}...")
+    try:
+        # feedparser can parse local XML files as well
+        feed = feedparser.parse(OUTPUT_FILE)
+        entries = []
+        for entry in feed.entries:
+            # Restore the datetime object
+            pub_struct = entry.get('published_parsed')
+            pub_date = convert_struct_time_to_datetime(pub_struct)
+
+            # Stored titles already look like "[Journal] Title"; keep them
+            # exactly as read
+            entries.append({
+                'title': entry.get('title', ''),  # Already carries the [Journal] prefix
+                'link': entry.get('link', ''),
+                'pub_date': pub_date,
+                'summary': entry.get('summary', ''),
+                'journal': entry.get('author', ''),  # The journal was stored in the author field
+                'id': entry.get('id', entry.get('link', '')),
+                'is_old': True  # Old item: no keyword matching needed again
+            })
+        return entries
+    except Exception as e:
+        print(f"Error reading existing file: {e}")
+        return []
+
+def match_entry(entry, queries):
+    """Return True when the entry matches at least one keyword query.
+
+    A query is one or more terms joined by a stand-alone upper-case AND; it
+    matches when every term occurs (case-insensitively) in the entry's title
+    or summary. Queries without any term are ignored.
+
+    Args:
+        entry: Dict with the keys 'title' and 'summary'.
+        queries: Iterable of query strings.
+    """
+    # Build the text to search in
+    title = entry.get('title') or ''
+    summary = entry.get('summary') or ''
+    text_to_search = f"{title} {summary}".lower()
+
+    for query in queries:
+        # Splitting on the bare substring "AND" would break terms such as
+        # "NAND"; empty terms are dropped because '' is contained in any text
+        keywords = [k.strip().lower() for k in AND_SEPARATOR.split(query)]
+        keywords = [k for k in keywords if k]
+        # Plain substring tests are fast and sufficient for research keywords
+        if keywords and all(k in text_to_search for k in keywords):
+            return True
+    return False
+
+def generate_rss_xml(items):
+    """Write the newest MAX_ITEMS items to OUTPUT_FILE as an RSS 2.0 feed.
+
+    Args:
+        items: Entry dicts with the keys 'title', 'link', 'summary',
+            'journal', 'id' and 'pub_date' (a naive datetime). The list
+            itself is left unmodified.
+    """
+    # Newest first, then keep only the latest MAX_ITEMS; sorted() avoids
+    # reordering the caller's list
+    newest = sorted(items, key=lambda x: x['pub_date'], reverse=True)
+    newest = newest[:MAX_ITEMS]
+
+    rss_items = []
+    for item in newest:
+        # Old items already carry the "[Journal] Title" prefix; only new
+        # items get it added
+        title = item['title']
+        if not item.get('is_old', False):
+            title = f"[{item['journal']}] {item['title']}"
+
+        rss_items.append(Item(
+            title = title,
+            link = item['link'],
+            description = item['summary'],
+            author = item['journal'],  # The author field stores the journal name
+            guid = Guid(item['id']),
+            pubDate = item['pub_date']
+        ))
+
+    feed = Feed(
+        title = "My Customized Papers (Auto-Filtered)",
+        link = "https://github.com/your_username/your_repo",
+        description = "Aggregated research papers based on keywords",
+        language = "en-US",
+        lastBuildDate = _utc_now(),  # rfeed writes the fields out as GMT
+        items = rss_items
+    )
+
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        f.write(feed.rss())
+    print(f"Successfully generated {OUTPUT_FILE} with {len(rss_items)} items.")
+
+def main():
+    """Merge newly matched feed entries into the stored feed and rewrite it."""
+    # 1. Load the configuration
+    rss_urls = load_config(JOURNALS_FILE, 'RSS_JOURNALS')
+    queries = load_config(KEYWORDS_FILE, 'RSS_KEYWORDS')
+
+    if not rss_urls or not queries:
+        print("Error: Configuration files are empty or missing.")
+        return
+
+    # The journal list may contain repeated URLs; fetch each feed only once
+    rss_urls = list(dict.fromkeys(rss_urls))
+
+    # 2. Load the old data (core de-duplication strategy: keep the history)
+    existing_entries = get_existing_items()
+    # IDs already present, for fast duplicate checks
+    seen_ids = {entry['id'] for entry in existing_entries}
+
+    all_entries = existing_entries.copy()
+    new_count = 0
+
+    # 3. Fetch new data
+    print("Starting RSS fetch from remote...")
+    for url in rss_urls:
+        for entry in parse_rss(url):
+            # Skip entries whose ID is already known
+            if entry['id'] in seen_ids:
+                continue
+
+            # Keyword matching
+            if match_entry(entry, queries):
+                all_entries.append(entry)
+                seen_ids.add(entry['id'])
+                new_count += 1
+                print(f"Match found: {entry['title'][:50]}...")
+
+    print(f"Added {new_count} new entries. Total entries before limit: {len(all_entries)}")
+
+    # 4. Write the new file (includes sorting and truncation)
+    generate_rss_xml(all_entries)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/journals.dat b/journals.dat
new file mode 100644
index 0000000..074052d
--- /dev/null
+++ b/journals.dat
@@ -0,0 +1,108 @@
+feeds.aps.org/rss/recent/prmaterials.xml
+feeds.aps.org/rss/recent/prx.xml
+feeds.aps.org/rss/tocsec/PRL-CondensedMatterStructureetc.xml
+feeds.aps.org/rss/tocsec/PRB-Structurestructuralphasetransitionsmechanicalpropertiesdefects.xml
+feeds.aps.org/rss/tocsec/PRB-SemiconductorsIbulk.xml
+https://advanced.onlinelibrary.wiley.com/feed/15214095/most-recent
+https://advanced.onlinelibrary.wiley.com/feed/16163028/most-recent
+https://www.science.org/action/showFeed?type=etoc&feed=rss&jc=science
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=aelccp
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=ancac3
+https://onlinelibrary.wiley.com/feed/16146840/most-recent
+https://advanced.onlinelibrary.wiley.com/feed/16163028/most-recent
+https://advanced.onlinelibrary.wiley.com/feed/15214095/most-recent
+https://onlinelibrary.wiley.com/feed/15213773/most-recent
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jacsat
+https://rss.sciencedirect.com/publication/science/25424351
+https://rss.sciencedirect.com/publication/science/25902385
+https://rss.sciencedirect.com/publication/science/22112855
+https://www.nature.com/nature.rss
+https://www.nature.com/ncomms.rss
+https://www.nature.com/nenergy.rss
+https://www.nature.com/nmat.rss
+https://www.nature.com/nphys.rss
+https://www.nature.com/natrevmats.rss
+https://www.nature.com/npjcompumats.rss
+https://onlinelibrary.wiley.com/feed/25673173/most-recent
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jctcce
+https://pubs.aip.org/rss/site_1000011/1000008.xml
+https://pubs.aip.org/rss/site_1000009/1000007.xml
+https://pubs.aip.org/rss/site_1000017/1000011.xml
+https://pubs.aip.org/rss/site_1000017/LatestOpenIssueArticles_1000011.xml
+https://pubs.aip.org/rss/site_1000019/1000012.xml
+https://pubs.aip.org/rss/site_1000019/LatestOpenIssueArticles_1000012.xml
+https://pubs.aip.org/rss/site_1000013/1000009.xml
+https://pubs.aip.org/rss/site_1000013/LatestOpenIssueArticles_1000009.xml
+https://rss.sciencedirect.com/publication/science/13697021
+https://rss.sciencedirect.com/publication/science/25425293
+https://www.tandfonline.com/feed/rss/tadp20
+https://feeds.aps.org/rss/recent/rmp.xml
+https://feeds.aps.org/rss/recent/prl.xml
+https://feeds.aps.org/rss/tocsec/PRB-Dynamicsdynamicalsystemslatticeeffectsquantumsolids.xml
+https://feeds.aps.org/rss/recent/prb.xml
+https://www.nature.com/natrevphys.rss
+https://www.nature.com/nnano.rss
+https://rss.sciencedirect.com/publication/science/00796425
+https://advanced.onlinelibrary.wiley.com/feed/21983844/most-recent
+https://www.pnas.org/action/showFeed?type=etoc&feed=rss&jc=PNAS
+https://www.pnas.org/action/showFeed?type=searchTopic&taxonomyCode=topic&tagCode=phys-sci
+https://rss.arxiv.org/rss/cond-mat
+https://rss.sciencedirect.com/publication/science/20959273
+https://rss.sciencedirect.com/publication/science/26671417
+https://www.nature.com/commsmat.rss
+https://rss.sciencedirect.com/publication/science/29497477
+https://rss.sciencedirect.com/publication/science/09270256
+https://rss.sciencedirect.com/publication/science/01672738
+https://www.annualreviews.org/rss/content/journals/conmatphys/latestarticles?fmt=rss
+https://www.annualreviews.org/rss/content/journals/matsci/latestarticles?fmt=rss
+https://www.nature.com/natmachintell.rss
+https://rss.sciencedirect.com/publication/science/00796425
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jpclcd
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jpccck
+https://rss.sciencedirect.com/publication/science/2352152X
+https://rss.sciencedirect.com/publication/science/13590286
+http://feeds.rsc.org/rss/dd
+https://rss.sciencedirect.com/publication/science/23528478
+https://chemrxiv.org/engage/rss/chemrxiv
+https://advanced.onlinelibrary.wiley.com/feed/29439981/most-recent
+https://rss.sciencedirect.com/publication/science/30509130
+https://feeds.aps.org/rss/recent/prxenergy.xml
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=achre4
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amrcda
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amlcef
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amacgu
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=apcach
+https://iopscience.iop.org/journal/rss/3050-287X
+https://ccs-stag.literatumonline.com/action/showFeed?type=etoc&feed=rss&jc=ccschem
+https://www.nature.com/natchemeng.rss
+http://feeds.rsc.org/rss/sc
+https://www.cell.com/iscience/current.rss
+https://www.cell.com/iscience/inpress.rss
+https://www.cell.com/Joule/current.rss
+https://www.cell.com/joule/inpress.rss
+https://www.cell.com/Matter/current.rss
+https://www.cell.com/matter/inpress.rss
+https://www.cell.com/Newton/current.rss
+https://www.cell.com/Newton/inpress.rss
+https://www.cell.com/Chem/current.rss
+https://www.cell.com/chem/inpress.rss
+https://www.cell.com/chem-catalysis/inpress.rss
+https://www.cell.com/chem-catalysis/current.rss
+https://www.cell.com/Cell-Reports-Physical-Science/inpress.rss
+https://www.cell.com/Cell-Reports-Physical-Science/current.rss
+https://onlinelibrary.wiley.com/feed/16136829/most-recent
+https://onlinelibrary.wiley.com/feed/23669608/most-recent
+https://onlinelibrary.wiley.com/feed/26379368/most-recent
+https://onlinelibrary.wiley.com/feed/16147065/most-recent
+https://onlinelibrary.wiley.com/feed/25750356/most-recent
+https://onlinelibrary.wiley.com/feed/25673165/most-recent
+https://onlinelibrary.wiley.com/feed/2688819x/most-recent
+https://onlinelibrary.wiley.com/feed/26884062/most-recent
+https://onlinelibrary.wiley.com/feed/26924552/most-recent
+https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jaaucr
+https://www.tandfonline.com/feed/rss/tmrl20
+https://rss.sciencedirect.com/publication/science/25425293
+https://rss.sciencedirect.com/publication/science/13596454
+https://rss.sciencedirect.com/publication/science/18722067
+https://rss.sciencedirect.com/publication/science/00219517
+https://www.nature.com/commsphys.rss
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d563d9e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+feedparser
+rfeed
\ No newline at end of file