api.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2021/3/17 21:51
# @Author : way
# @Site :
# @Describe: API service
import os
import time
import json
import psutil
import subprocess
import socket
import uvicorn
from uuid import uuid1
from fastapi import FastAPI
from typing import Optional
from SP.utils.ctrl_redis import RedisCtrl

app = FastAPI()
Host = socket.gethostbyname(socket.gethostname())  # IP of this machine, used to tag and filter tasks


class Task:
    """In-memory representation of a spider task."""

    def __init__(self, **kwargs):
        self.id = kwargs.get('id')
        self.spider = kwargs.get('spider')
        self.cmd = kwargs.get('cmd')
        self.pid = kwargs.get('pid')
        self.host = Host
        self.start = kwargs.get('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))


class TaskRedis:
    """Persist tasks in a Redis hash so every worker sees the same task list."""

    def __init__(self):
        self.redis_key = 'tasks'
        self.redisctrl = RedisCtrl()

    def push(self, task):
        val = json.dumps(task.__dict__, ensure_ascii=False)
        self.redisctrl.add_hash(self.redis_key, task.id, val)

    def remove(self, id):
        self.redisctrl.remove_hash(self.redis_key, id)

    def get_task(self, id):
        return self.redisctrl.get_hash(self.redis_key, id)

    def get_tasks(self, spider=None, sort='spider'):
        # Group tasks by spider name (default) or by host.
        tasks_map = {}
        for val in self.redisctrl.get_hashall(self.redis_key).values():
            task = Task(**json.loads(val))
            if spider and spider != task.spider:
                continue
            sort_key = task.host if sort == 'host' else task.spider
            tasks = tasks_map.get(sort_key, [])
            tasks.append(task.__dict__)
            tasks_map[sort_key] = tasks
        return tasks_map
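
# Illustrative sketch of what TaskRedis.push stores under the "tasks" hash:
# one field per task id, value = the JSON-serialized Task.__dict__. The
# concrete values below are made up for illustration:
#
#   "b1b9...-uuid1" -> {"id": "b1b9...-uuid1", "spider": "demo",
#                       "cmd": "scrapy crawl demo", "pid": 12345,
#                       "host": "192.168.1.10", "start": "2021-03-17 21:51:00"}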


# List the names of all available spiders
@app.get("/openapi/spiders")
async def spiders():
    spiders = []
    for file in os.listdir('SP/spiders'):
        if file not in ('__init__.py', 'SPRedisSpider.py', '__pycache__'):
            spiders.append(file.split('.')[0])
    return {'total': len(spiders), 'spiders': spiders}


# List all running spider tasks
@app.get("/openapi/tasks")
async def tasks(spider: Optional[str] = None, sort: Optional[str] = 'spider'):
    # NOTE: get_tasks returns a grouped dict, so 'total' counts groups, not individual tasks.
    tasks = TaskRedis().get_tasks(spider, sort)
    return {'total': len(tasks), 'tasks': tasks}


# Launch a spider-related command
@app.get("/openapi/run/{spider}")
def run(spider: str, cmd: Optional[str] = None):
    """Supported cmd forms:
    generate jobs:        python SP_JOBS/spider_job.py --onlyjob true
    run a spider:         scrapy crawl spider
    download attachments: python execute_download.py -s spider
    """
    task = Task()
    task.spider = spider
    task.cmd = cmd
    p = subprocess.Popen(task.cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    task.pid = p.pid
    task.id = str(uuid1())
    redis = TaskRedis()
    redis.push(task)
    stdout, stderr = p.communicate()  # blocks until the command exits
    redis.remove(task.id)
    try:
        # Try GBK first (Chinese Windows console output), then the default encoding.
        msg = stdout.decode('gbk') + stderr.decode('gbk')
    except UnicodeDecodeError:
        msg = stdout.decode() + stderr.decode()
    end = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    return {'returncode': p.returncode, 'host': task.host, 'cmd': task.cmd, 'start': task.start, 'end': end, 'msg': msg}


# Stop a running spider process
@app.get("/openapi/kill/{id}")
def kill(id: str):
    redis = TaskRedis()
    task = redis.get_task(id)
    task = Task(**json.loads(task))
    if task.host != Host:
        return {'returncode': -1, 'msg': 'task is not running on this host'}
    else:
        try:
            # Terminate the whole process tree: children first, then the parent.
            p = psutil.Process(task.pid)
            for son in p.children(recursive=True):
                son.terminate()
            p.terminate()
        except Exception as e:
            return {'returncode': -1, 'msg': str(e)}
        redis.remove(task.id)
        return {'returncode': 0, 'msg': 'success'}


if __name__ == '__main__':
    uvicorn.run(app='api:app', host="127.0.0.1", port=2021, reload=True)
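
# A minimal usage sketch, assuming the service runs locally on port 2021 as
# configured above; the spider name "demo" and the task id are hypothetical:
#
#   curl "http://127.0.0.1:2021/openapi/spiders"
#   curl "http://127.0.0.1:2021/openapi/tasks?sort=host"
#   curl "http://127.0.0.1:2021/openapi/run/demo?cmd=scrapy%20crawl%20demo"
#   curl "http://127.0.0.1:2021/openapi/kill/b1b9...-uuid1"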