Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
3183445
告警成功率阈值从硬编码50%移入配置文件,以支持自定义;相关日志调整
Apr 7, 2026
e8426e7
提高redis依赖兼容性:最大版本支持从<4.0提升到<6.0
Apr 7, 2026
f471fbf
提高redis依赖兼容性:最大版本支持从<4.0提升到<6.0
Apr 7, 2026
b1c5afa
文件下载爬虫第一次提交
Apr 7, 2026
5c0e78a
文件下载爬虫第二次提交,修复bug
Apr 7, 2026
edaeee4
文件下载爬虫第三次提交,修复细节bug
Apr 7, 2026
744f172
文件下载爬虫第四次提交,文档细节调整
Apr 7, 2026
0bfd88d
文件下载爬虫支持任务字段透传组装最终item;on_task_all_done方法签名修改。
Apr 8, 2026
e25915c
解决“文件下载爬虫支持任务字段透传组装最终item”的bug——第一次提交
Apr 8, 2026
5f4ddb4
解决“文件下载爬虫支持任务字段透传组装最终item”的bug——第一次提交
Apr 8, 2026
7162763
解决“文件下载爬虫支持任务字段透传组装最终item”的bug——第二次提交——修复竞态条件bug,新增skipped_count字段。
Apr 8, 2026
6c04fc0
解决“文件下载爬虫支持任务字段透传组装最终item”的bug——第四次提交——添加run_id标识,避免小概率情况下跨批次请求造成统计错误。
Apr 8, 2026
517042a
第五次提交:修复方法签名不一致问题,文档更新。
Apr 8, 2026
b1db0c9
第六次提交:修复方法签名不一致问题;选择mysql作为缓存时,按 redis_key分表,减少跨业务串扰。
Apr 8, 2026
b7a0835
Merge pull request #4 from gyj126/file-spider
gyj126 Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
第六次提交:修复方法签名不一致问题;选择mysql作为缓存时,按 redis_key分表,减少跨业务串扰。
  • Loading branch information
gaoyunjian committed Apr 8, 2026
commit b1db0c9bf0b0ece98f3850d20f5e045bb4a76df0
42 changes: 41 additions & 1 deletion docs/usage/FileSpider.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,46 @@ class MyFileSpider(feapder.FileSpider):

## 4. 使用示例

### 启动方式(单进程 / master-worker 分离)

FileSpider 支持两种启动方式:

1. 单进程:`spider.start()`,适合本地调试
2. 分离运行:master 仅负责派发任务,worker 仅负责下载处理,适合生产部署

```python
from feapder import ArgumentParser

if __name__ == "__main__":
    # Build the spider: tasks come from `task_table`; `redis_key` namespaces
    # the spider's queue/dedup state; `task_keys` are the task columns made
    # available to the spider (presumably passed through to the final item —
    # confirm against FileSpider docs).
    spider = MyFileSpider(
        redis_key="my_file_spider",
        task_table="file_task",
        task_keys=["id", "file_urls"],
    )

    # feapder's ArgumentParser extends argparse with a `function=` kwarg:
    # the callable bound to whichever flag is passed gets invoked by
    # parser.start(). This lets master and worker run as separate processes.
    parser = ArgumentParser(description="MyFileSpider 文件下载爬虫")
    parser.add_argument(
        "--start_master",
        action="store_true",
        help="添加任务",
        function=spider.start_monitor_task,  # master: only dispatches tasks
    )
    parser.add_argument(
        "--start_worker",
        action="store_true",
        help="启动爬虫",
        function=spider.start,  # worker: only consumes/downloads
    )
    parser.start()  # dispatch to the function bound to the supplied flag
```

命令行启动:

```bash
uv run my_file_spider.py --start_master
uv run my_file_spider.py --start_worker
```

### 场景一:保存到本地磁盘

最简单的用法,下载文件保存到本地:
Expand Down Expand Up @@ -330,7 +370,7 @@ FileSpider 提供两级去重:
|------|--------|------|----------|
| 不去重 | `None`(默认) | - | 每次都重新下载 |
| Redis 去重 | `"redis"` | Redis Hash | 分布式共享,多进程安全 |
| MySQL 去重 | `"mysql"` | MySQL 表(自动建表) | 持久化,长期缓存 |
| MySQL 去重 | `"mysql"` | MySQL 表(按 `redis_key` 自动分表) | 持久化,隔离不同业务 |
| 自定义去重 | `FileDedup` 实例 | 用户自定义 | 特殊需求 |

### 自定义去重
Expand Down
5 changes: 4 additions & 1 deletion feapder/core/spiders/file_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import hashlib
import os
import re
import warnings
from urllib.parse import urlparse, unquote

Expand Down Expand Up @@ -125,7 +126,9 @@ def __init__(
elif file_dedup == "mysql":
if file_dedup_expire is not None:
log.warning("file_dedup_expire仅在file_dedup='redis'时生效")
self._file_dedup = MysqlFileDedup()
redis_namespace = re.sub(r"[^0-9a-zA-Z_]+", "_", self._redis_key).strip("_")
dedup_table = f"file_dedup_{redis_namespace}" if redis_namespace else "file_dedup_default"
self._file_dedup = MysqlFileDedup(table=dedup_table)
elif isinstance(file_dedup, FileDedup):
self._file_dedup = file_dedup
elif file_dedup is not None:
Expand Down
11 changes: 5 additions & 6 deletions feapder/templates/file_spider_template.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,9 @@ if __name__ == "__main__":

parser.start()

# 直接启动
# spider.start() # 启动爬虫
# spider.start_monitor_task() # 添加任务
# 启动方式一:单进程(调试方便)
# spider.start()

# 通过命令行启动
# python ${file_name} --start_master # 添加任务
# python ${file_name} --start_worker # 启动爬虫
# 启动方式二:分离 master/worker(生产推荐)
# uv run ${file_name} --start_master # 仅负责派发任务
# uv run ${file_name} --start_worker # 仅负责消费下载
24 changes: 21 additions & 3 deletions tests/file-spider/test_oss_result_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from urllib.parse import urlparse, unquote

import feapder
from feapder import ArgumentParser
from feapder.network.item import Item
from feapder.utils.log import log

Expand Down Expand Up @@ -65,10 +66,13 @@ def process_file(self, task_id, url, file_path, response):
# self.oss_client.put_object(file_path, response.content)
return f"https://my-bucket.oss.aliyuncs.com/{file_path}"

def on_task_all_done(self, task, result, success_count, fail_count, total_count):
def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count):
# result 与 get_download_urls 返回的列表严格位置对应
# 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"]
log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}")
log.info(
f"任务{task.id} 完成 成功={success_count} 失败={fail_count} "
f"跳过={skipped_count} 去重={dup_count}"
)

# 组装结果 Item 写入结果表
item = FileResultItem()
Expand All @@ -89,4 +93,18 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count)
task_table="file_task",
task_keys=["id", "file_urls"],
)
spider.start_monitor_task()

# CLI dispatch via feapder's ArgumentParser: each flag is bound to an entry
# point through the non-standard `function=` kwarg, and parser.start() calls
# the one matching the flag given on the command line. This allows running
# master (task dispatch) and worker (download/consume) as separate processes.
parser = ArgumentParser(description="OssResultSpider 文件下载爬虫")
parser.add_argument(
    "--start_master",
    action="store_true",
    help="添加任务",
    function=spider.start_monitor_task,  # master: enqueue tasks only
)
parser.add_argument(
    "--start_worker",
    action="store_true",
    help="启动爬虫",
    function=spider.start,  # worker: download and process only
)
parser.start()  # run the function bound to whichever flag was passed