Initial setup with privacy protection
This commit is contained in:
41
.github/workflows/rss_action.yaml
vendored
Normal file
41
.github/workflows/rss_action.yaml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Auto RSS Fetch

on:
  schedule:
    - cron: '0 */6 * * *'  # run every 6 hours
  workflow_dispatch:  # allow manual runs

jobs:
  build:
    runs-on: ubuntu-latest

    # Required: lets the job push the regenerated feed back to the repository
    permissions:
      contents: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Run RSS Script
        env:
          RSS_KEYWORDS: ${{ secrets.RSS_KEYWORDS }}
          # get_RSS.py also reads RSS_JOURNALS; when the value is empty the
          # script falls back to journals.dat.
          RSS_JOURNALS: ${{ secrets.RSS_JOURNALS }}
        run: python get_RSS.py

      - name: Commit and Push changes
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add filtered_feed.xml
          # Commit only when the feed file is new or has changed
          git commit -m "Auto-update RSS feed" || echo "No changes to commit"
          git push
|
||||
203
get_RSS.py
Normal file
203
get_RSS.py
Normal file
@@ -0,0 +1,203 @@
|
||||
import feedparser
|
||||
import re
|
||||
import os
|
||||
import datetime
|
||||
import time
|
||||
from rfeed import Item, Feed, Guid
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
# --- Configuration ---
OUTPUT_FILE = "filtered_feed.xml"  # generated feed file
MAX_ITEMS = 200  # maximum number of items kept in the feed (rolling window)
JOURNALS_FILE = 'journals.dat'
KEYWORDS_FILE = 'keywords.dat'
# ---------------------
|
||||
|
||||
def load_config(filename, env_var_name=None):
    """Load a list of configuration entries.

    The environment variable is preferred (so GitHub Actions secrets can keep
    the values private); the local file is the fallback for local testing.

    Args:
        filename: Path of the local config file, one entry per line.
        env_var_name: Optional name of an environment variable holding the
            entries, separated by newlines or, when the value contains no
            newline, by semicolons.

    Returns:
        A list of stripped, non-empty entries with ``#`` comment lines
        removed, or an empty list when neither source is available.
    """
    def _clean(raw_entries):
        # Strip before testing for '#', so indented comment lines are
        # recognised as comments instead of leaking through as entries.
        stripped = (raw.strip() for raw in raw_entries)
        return [entry for entry in stripped if entry and not entry.startswith('#')]

    # 1. Environment variable (secrets) takes precedence when set and non-empty.
    content = os.environ.get(env_var_name) if env_var_name else None
    if content:
        print(f"Loading config from environment variable: {env_var_name}")
        separator = '\n' if '\n' in content else ';'
        return _clean(content.split(separator))

    # 2. Local file.
    if os.path.exists(filename):
        print(f"Loading config from local file: {filename}")
        with open(filename, 'r', encoding='utf-8') as f:
            return _clean(f)

    print(f"Warning: No config found for {filename} or {env_var_name}")
    return []
|
||||
|
||||
def convert_struct_time_to_datetime(struct_time):
    """Convert a feedparser time tuple into a naive ``datetime``.

    The calendar fields are copied straight across instead of going through
    ``time.mktime``/``fromtimestamp``: that round trip interprets the tuple as
    local time, which shifts the result by an hour in time zones observing
    daylight saving time and can fail for values the platform cannot
    represent. feedparser documents its ``*_parsed`` values as UTC, so the
    result is a naive UTC datetime.

    Args:
        struct_time: A ``time.struct_time`` (or any sequence whose first six
            fields are year, month, day, hour, minute, second), or a falsy
            value when the entry carries no date.

    Returns:
        The matching naive datetime; the current UTC time when no date was
        supplied.
    """
    if not struct_time:
        # Keep the fallback on the same (UTC) clock as the parsed dates.
        return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    year, month, day, hour, minute, second = struct_time[:6]
    # struct_time allows leap seconds (60/61); datetime does not.
    return datetime.datetime(year, month, day, hour, minute, min(second, 59))
|
||||
|
||||
def parse_rss(rss_url, retries=3):
    """Fetch one online RSS/Atom feed and return its entries as plain dicts.

    Args:
        rss_url: Feed address. A value with no URL scheme that is not an
            existing local path gets ``https://`` prepended, because
            feedparser would otherwise treat it as a file name or raw feed
            text rather than downloading it.
        retries: Maximum number of fetch attempts.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``pub_date``,
        ``summary``, ``journal`` and ``id`` (used for de-duplication);
        an empty list when every attempt failed.
    """
    print(f"Fetching: {rss_url}...")

    fetch_url = rss_url
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', rss_url)
    if not has_scheme and not os.path.exists(rss_url):
        fetch_url = f"https://{rss_url}"

    for attempt in range(1, retries + 1):
        try:
            feed = feedparser.parse(fetch_url)
            # feedparser signals download/parse problems through `bozo`
            # instead of raising, so a failed fetch has to be detected here
            # for the retry loop to have any effect. A feed that is flagged
            # but still yielded entries is accepted as-is.
            if feed.entries or not feed.get('bozo'):
                journal_title = feed.feed.get('title', 'Unknown Journal')
                return [
                    {
                        'title': entry.get('title', ''),
                        'link': entry.get('link', ''),
                        'pub_date': convert_struct_time_to_datetime(
                            entry.get('published_parsed', entry.get('updated_parsed'))
                        ),
                        'summary': entry.get('summary', entry.get('description', '')),
                        'journal': journal_title,
                        'id': entry.get('id', entry.get('link', '')),  # used for de-duplication
                    }
                    for entry in feed.entries
                ]
            error = feed.get('bozo_exception', 'feed could not be parsed')
        except Exception as e:
            error = e

        print(f"Error parsing {rss_url}: {error}")
        if attempt < retries:
            # Pause only between attempts, not after the last one.
            time.sleep(2)
    return []
|
||||
|
||||
def get_existing_items():
    """Read the previously generated feed file so history is carried over.

    Returns:
        A list of entry dicts (``title``, ``link``, ``pub_date``, ``summary``,
        ``journal``, ``id``) each flagged with ``is_old=True``; an empty list
        when ``OUTPUT_FILE`` does not exist or cannot be read.
    """
    if not os.path.exists(OUTPUT_FILE):
        return []

    print(f"Loading existing items from {OUTPUT_FILE}...")
    try:
        # feedparser can parse a local XML file as well as a URL.
        feed = feedparser.parse(OUTPUT_FILE)
        if feed.get('bozo'):
            # feedparser does not raise on a damaged file; surface it so a
            # silently shrinking history is visible in the run log.
            print(f"Warning: {OUTPUT_FILE} is not well-formed: {feed.get('bozo_exception')}")

        return [
            {
                # The stored title already carries its "[Journal]" prefix.
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'pub_date': convert_struct_time_to_datetime(
                    entry.get('published_parsed', entry.get('updated_parsed'))
                ),
                'summary': entry.get('summary', ''),
                # The journal name was written into the author field.
                'journal': entry.get('author', ''),
                'id': entry.get('id', entry.get('link', '')),
                # Old items skip keyword matching and title prefixing.
                'is_old': True,
            }
            for entry in feed.entries
        ]
    except Exception as e:
        print(f"Error reading existing file: {e}")
        return []
|
||||
|
||||
def match_entry(entry, queries):
    """Return True when the entry satisfies at least one query.

    A query is one keyword, or several keywords joined by an upper-case
    ``AND`` standing as its own word; every keyword of a query must occur
    (case-insensitively, as a substring) in the entry's title or summary.

    Args:
        entry: Dict with string values under ``'title'`` and ``'summary'``.
        queries: Iterable of query strings.

    Returns:
        True if any query matches, otherwise False. Queries that contain no
        keyword at all (empty, or just ``AND``) never match.
    """
    text_to_search = (entry['title'] + " " + entry['summary']).lower()

    for query in queries:
        # Split only on a standalone AND, so words that merely contain the
        # letters (NAND, BAND, ANDROID) stay intact.
        parts = re.split(r'(?:^|\s+)AND(?:\s+|$)', query.strip())
        keywords = [part.strip().lower() for part in parts if part.strip()]
        # An empty keyword list would otherwise match every entry.
        if keywords and all(keyword in text_to_search for keyword in keywords):
            return True
    return False
|
||||
|
||||
def generate_rss_xml(items):
    """Write the newest entries to ``OUTPUT_FILE`` as an RSS 2.0 document.

    Args:
        items: Entry dicts with the keys ``title``, ``link``, ``summary``,
            ``journal``, ``id`` and ``pub_date`` (a ``datetime``); an
            optional truthy ``is_old`` marks entries whose title already
            carries the ``[Journal]`` prefix.

    The list passed in is not modified.
    """
    # Newest first, capped at MAX_ITEMS. sorted() leaves the caller's list
    # untouched, unlike an in-place sort.
    newest_first = sorted(items, key=lambda item: item['pub_date'], reverse=True)[:MAX_ITEMS]

    rss_items = []
    for item in newest_first:
        if item.get('is_old', False):
            # Already stored as "[Journal] Title"; do not prefix it twice.
            title = item['title']
        else:
            title = f"[{item['journal']}] {item['title']}"

        rss_items.append(Item(
            title=title,
            link=item['link'],
            description=item['summary'],
            author=item['journal'],  # author field is borrowed to store the journal name
            guid=Guid(item['id']),
            pubDate=item['pub_date'],
        ))

    feed = Feed(
        title="My Customized Papers (Auto-Filtered)",
        link="https://github.com/your_username/your_repo",
        description="Aggregated research papers based on keywords",
        language="en-US",
        # NOTE(review): rfeed is understood to render dates with a GMT label,
        # so pass UTC rather than local wall-clock time.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
        items=rss_items,
    )

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(feed.rss())
    print(f"Successfully generated {OUTPUT_FILE} with {len(rss_items)} items.")
|
||||
|
||||
def main():
    """Fetch all configured feeds, keep keyword matches, and rewrite the feed.

    Previously published items are loaded first and kept, so an entry is
    added only when its id has not been seen before and it matches at least
    one query.
    """
    # 1. Load configuration (environment variable first, then local file).
    rss_urls = load_config(JOURNALS_FILE, 'RSS_JOURNALS')
    queries = load_config(KEYWORDS_FILE, 'RSS_KEYWORDS')

    if not rss_urls or not queries:
        print("Error: Configuration files are empty or missing.")
        return

    # The journal list may repeat a feed; fetch each one once, in order.
    rss_urls = list(dict.fromkeys(rss_urls))

    # 2. Load history (de-duplication strategy: keep what was already published).
    existing_entries = get_existing_items()
    seen_ids = {entry['id'] for entry in existing_entries}

    all_entries = list(existing_entries)
    new_count = 0

    # 3. Fetch new data.
    print("Starting RSS fetch from remote...")
    for url in rss_urls:
        for entry in parse_rss(url):
            # Skip anything already present in the history or in this run.
            if entry['id'] in seen_ids:
                continue

            if match_entry(entry, queries):
                all_entries.append(entry)
                seen_ids.add(entry['id'])
                new_count += 1
                print(f"Match found: {entry['title'][:50]}...")

    print(f"Added {new_count} new entries. Total entries before limit: {len(all_entries)}")

    # 4. Write the new file (sorting and truncation happen there).
    generate_rss_xml(all_entries)


if __name__ == '__main__':
    main()
|
||||
108
journals.dat
Normal file
108
journals.dat
Normal file
@@ -0,0 +1,108 @@
|
||||
https://feeds.aps.org/rss/recent/prmaterials.xml
|
||||
https://feeds.aps.org/rss/recent/prx.xml
|
||||
https://feeds.aps.org/rss/tocsec/PRL-CondensedMatterStructureetc.xml
|
||||
https://feeds.aps.org/rss/tocsec/PRB-Structurestructuralphasetransitionsmechanicalpropertiesdefects.xml
|
||||
https://feeds.aps.org/rss/tocsec/PRB-SemiconductorsIbulk.xml
|
||||
https://advanced.onlinelibrary.wiley.com/feed/15214095/most-recent
|
||||
https://advanced.onlinelibrary.wiley.com/feed/16163028/most-recent
|
||||
https://www.science.org/action/showFeed?type=etoc&feed=rss&jc=science
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=aelccp
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=ancac3
|
||||
https://onlinelibrary.wiley.com/feed/16146840/most-recent
|
||||
https://advanced.onlinelibrary.wiley.com/feed/16163028/most-recent
|
||||
https://advanced.onlinelibrary.wiley.com/feed/15214095/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/15213773/most-recent
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jacsat
|
||||
https://rss.sciencedirect.com/publication/science/25424351
|
||||
https://rss.sciencedirect.com/publication/science/25902385
|
||||
https://rss.sciencedirect.com/publication/science/22112855
|
||||
https://www.nature.com/nature.rss
|
||||
https://www.nature.com/ncomms.rss
|
||||
https://www.nature.com/nenergy.rss
|
||||
https://www.nature.com/nmat.rss
|
||||
https://www.nature.com/nphys.rss
|
||||
https://www.nature.com/natrevmats.rss
|
||||
https://www.nature.com/npjcompumats.rss
|
||||
https://onlinelibrary.wiley.com/feed/25673173/most-recent
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jctcce
|
||||
https://pubs.aip.org/rss/site_1000011/1000008.xml
|
||||
https://pubs.aip.org/rss/site_1000009/1000007.xml
|
||||
https://pubs.aip.org/rss/site_1000017/1000011.xml
|
||||
https://pubs.aip.org/rss/site_1000017/LatestOpenIssueArticles_1000011.xml
|
||||
https://pubs.aip.org/rss/site_1000019/1000012.xml
|
||||
https://pubs.aip.org/rss/site_1000019/LatestOpenIssueArticles_1000012.xml
|
||||
https://pubs.aip.org/rss/site_1000013/1000009.xml
|
||||
https://pubs.aip.org/rss/site_1000013/LatestOpenIssueArticles_1000009.xml
|
||||
https://rss.sciencedirect.com/publication/science/13697021
|
||||
https://rss.sciencedirect.com/publication/science/25425293
|
||||
https://www.tandfonline.com/feed/rss/tadp20
|
||||
https://feeds.aps.org/rss/recent/rmp.xml
|
||||
https://feeds.aps.org/rss/recent/prl.xml
|
||||
https://feeds.aps.org/rss/tocsec/PRB-Dynamicsdynamicalsystemslatticeeffectsquantumsolids.xml
|
||||
https://feeds.aps.org/rss/recent/prb.xml
|
||||
https://www.nature.com/natrevphys.rss
|
||||
https://www.nature.com/nnano.rss
|
||||
https://rss.sciencedirect.com/publication/science/00796425
|
||||
https://advanced.onlinelibrary.wiley.com/feed/21983844/most-recent
|
||||
https://www.pnas.org/action/showFeed?type=etoc&feed=rss&jc=PNAS
|
||||
https://www.pnas.org/action/showFeed?type=searchTopic&taxonomyCode=topic&tagCode=phys-sci
|
||||
https://rss.arxiv.org/rss/cond-mat
|
||||
https://rss.sciencedirect.com/publication/science/20959273
|
||||
https://rss.sciencedirect.com/publication/science/26671417
|
||||
https://www.nature.com/commsmat.rss
|
||||
https://rss.sciencedirect.com/publication/science/29497477
|
||||
https://rss.sciencedirect.com/publication/science/09270256
|
||||
https://rss.sciencedirect.com/publication/science/01672738
|
||||
https://www.annualreviews.org/rss/content/journals/conmatphys/latestarticles?fmt=rss
|
||||
https://www.annualreviews.org/rss/content/journals/matsci/latestarticles?fmt=rss
|
||||
https://www.nature.com/natmachintell.rss
|
||||
https://rss.sciencedirect.com/publication/science/00796425
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jpclcd
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jpccck
|
||||
https://rss.sciencedirect.com/publication/science/2352152X
|
||||
https://rss.sciencedirect.com/publication/science/13590286
|
||||
http://feeds.rsc.org/rss/dd
|
||||
https://rss.sciencedirect.com/publication/science/23528478
|
||||
https://chemrxiv.org/engage/rss/chemrxiv
|
||||
https://advanced.onlinelibrary.wiley.com/feed/29439981/most-recent
|
||||
https://rss.sciencedirect.com/publication/science/30509130
|
||||
https://feeds.aps.org/rss/recent/prxenergy.xml
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=achre4
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amrcda
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amlcef
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=amacgu
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=apcach
|
||||
https://iopscience.iop.org/journal/rss/3050-287X
|
||||
https://ccs-stag.literatumonline.com/action/showFeed?type=etoc&feed=rss&jc=ccschem
|
||||
https://www.nature.com/natchemeng.rss
|
||||
http://feeds.rsc.org/rss/sc
|
||||
https://www.cell.com/iscience/current.rss
|
||||
https://www.cell.com/iscience/inpress.rss
|
||||
https://www.cell.com/Joule/current.rss
|
||||
https://www.cell.com/joule/inpress.rss
|
||||
https://www.cell.com/Matter/current.rss
|
||||
https://www.cell.com/matter/inpress.rss
|
||||
https://www.cell.com/Newton/current.rss
|
||||
https://www.cell.com/Newton/inpress.rss
|
||||
https://www.cell.com/Chem/current.rss
|
||||
https://www.cell.com/chem/inpress.rss
|
||||
https://www.cell.com/chem-catalysis/inpress.rss
|
||||
https://www.cell.com/chem-catalysis/current.rss
|
||||
https://www.cell.com/Cell-Reports-Physical-Science/inpress.rss
|
||||
https://www.cell.com/Cell-Reports-Physical-Science/current.rss
|
||||
https://onlinelibrary.wiley.com/feed/16136829/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/23669608/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/26379368/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/16147065/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/25750356/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/25673165/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/2688819x/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/26884062/most-recent
|
||||
https://onlinelibrary.wiley.com/feed/26924552/most-recent
|
||||
https://pubs.acs.org/action/showFeed?type=axatoc&feed=rss&jc=jaaucr
|
||||
https://www.tandfonline.com/feed/rss/tmrl20
|
||||
https://rss.sciencedirect.com/publication/science/25425293
|
||||
https://rss.sciencedirect.com/publication/science/13596454
|
||||
https://rss.sciencedirect.com/publication/science/18722067
|
||||
https://rss.sciencedirect.com/publication/science/00219517
|
||||
https://www.nature.com/commsphys.rss
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
feedparser
|
||||
rfeed
|
||||
Reference in New Issue
Block a user