wwwlll
committed on
Commit
·
13bc594
1
Parent(s):
f63d19e
Add agent component for web crawler (#2878)
Browse files
### What problem does this PR solve?
Add agent component for web crawler
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- agent/component/__init__.py +1 -0
- agent/component/crawler.py +71 -0
- web/src/assets/svg/crawler.svg +1 -0
- web/src/locales/en.ts +10 -0
- web/src/locales/zh-traditional.ts +9 -0
- web/src/locales/zh.ts +9 -0
- web/src/pages/flow/constant.tsx +12 -0
- web/src/pages/flow/flow-drawer/index.tsx +2 -0
- web/src/pages/flow/form/crawler-form/index.tsx +37 -0
agent/component/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
|
|
| 28 |
from .jin10 import Jin10, Jin10Param
|
| 29 |
from .tushare import TuShare, TuShareParam
|
| 30 |
from .akshare import AkShare, AkShareParam
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def component_class(class_name):
|
|
|
|
| 28 |
from .jin10 import Jin10, Jin10Param
|
| 29 |
from .tushare import TuShare, TuShareParam
|
| 30 |
from .akshare import AkShare, AkShareParam
|
| 31 |
+
from .crawler import Crawler, CrawlerParam
|
| 32 |
|
| 33 |
|
| 34 |
def component_class(class_name):
|
agent/component/crawler.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
#
|
| 16 |
+
from abc import ABC
|
| 17 |
+
import asyncio
|
| 18 |
+
from crawl4ai import AsyncWebCrawler
|
| 19 |
+
from agent.component.base import ComponentBase, ComponentParamBase
|
| 20 |
+
|
| 21 |
+
class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.

    Attributes:
        proxy: Optional proxy URL passed to the crawler (for example
            "http://127.0.0.1:8888"). ``None`` means no proxy is configured.
        extract_type: Which field of the crawl result the component returns:
            'html', 'markdown' or 'content'. Any other value is treated as
            'markdown' by ``Crawler.get_web``.
    """

    def __init__(self):
        super().__init__()
        # Declare every parameter that Crawler.get_web reads, so the attributes
        # exist even when the flow configuration does not supply them.
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        # No validation is enforced: unknown extract types fall back to
        # markdown at crawl time instead of being rejected here.
        return True
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Crawler(ComponentBase, ABC):
    """
    Agent component that fetches a web page and returns part of the crawl
    result as the component output.
    """
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        """
        Crawl the URL taken from the component input.

        The "content" entries of the input are joined with " - " and the
        resulting string is used as the URL. An empty input yields an empty
        output. Any exception raised while crawling is reported as text in the
        output instead of being propagated.
        """
        ans = self.get_input()
        # NOTE(review): assumes ans["content"] is an iterable of strings; with
        # more than one entry the joined value is unlikely to be a valid URL.
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Crawler.be_output("")
        try:
            # get_web is a coroutine; drive it to completion synchronously.
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        """
        Fetch ``url`` with AsyncWebCrawler and return the requested field.

        Returns ``cleaned_html`` for extract type 'html', ``extracted_content``
        for 'content', and ``markdown`` for 'markdown' or any other value.
        """
        # Read parameters defensively: a configuration that omits them must
        # not abort the crawl with an AttributeError.
        proxy = getattr(self._param, "proxy", None) or None
        extract_type = getattr(self._param, "extract_type", "markdown")
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )

            if extract_type == 'html':
                return result.cleaned_html
            if extract_type == 'content':
                return result.extracted_content
            # 'markdown' and every unrecognised value share this fallback.
            return result.markdown
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
web/src/assets/svg/crawler.svg
ADDED
|
|
web/src/locales/en.ts
CHANGED
|
@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
|
|
| 928 |
yahooFinance: 'YahooFinance',
|
| 929 |
yahooFinanceDescription:
|
| 930 |
'The component queries information about the company based on the provided ticker symbol.',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
info: 'Info',
|
| 932 |
history: 'History',
|
| 933 |
financials: 'Financials',
|
|
|
|
| 928 |
yahooFinance: 'YahooFinance',
|
| 929 |
yahooFinanceDescription:
|
| 930 |
'The component queries information about the company based on the provided ticker symbol.',
|
| 931 |
+
crawler: 'Web Crawler',
|
| 932 |
+
crawlerDescription:
|
| 933 |
+
'This component can be used to crawl HTML source code from a specified URL.',
|
| 934 |
+
proxy: 'Proxy',
|
| 935 |
+
crawlerResultOptions: {
|
| 936 |
+
html: 'Html',
|
| 937 |
+
markdown: 'Markdown',
|
| 938 |
+
content: 'Content',
|
| 939 |
+
},
|
| 940 |
+
extractType: 'extractType',
|
| 941 |
info: 'Info',
|
| 942 |
history: 'History',
|
| 943 |
financials: 'Financials',
|
web/src/locales/zh-traditional.ts
CHANGED
|
@@ -877,6 +877,15 @@ export default {
|
|
| 877 |
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
|
| 878 |
yahooFinance: '雅虎財經',
|
| 879 |
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 880 |
info: '訊息',
|
| 881 |
history: '歷史',
|
| 882 |
financials: '財務',
|
|
|
|
| 877 |
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
|
| 878 |
yahooFinance: '雅虎財經',
|
| 879 |
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
|
| 880 |
+
crawler: '網頁爬蟲',
|
| 881 |
+
crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
|
| 882 |
+
proxy: '代理',
|
| 883 |
+
crawlerResultOptions: {
|
| 884 |
+
html: 'Html',
|
| 885 |
+
markdown: 'Markdown',
|
| 886 |
+
content: '文本',
|
| 887 |
+
},
|
| 888 |
+
extractType: '提取類型',
|
| 889 |
info: '訊息',
|
| 890 |
history: '歷史',
|
| 891 |
financials: '財務',
|
web/src/locales/zh.ts
CHANGED
|
@@ -897,6 +897,15 @@ export default {
|
|
| 897 |
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
|
| 898 |
yahooFinance: '雅虎财经',
|
| 899 |
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
info: '信息',
|
| 901 |
history: '历史',
|
| 902 |
financials: '财务',
|
|
|
|
| 897 |
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
|
| 898 |
yahooFinance: '雅虎财经',
|
| 899 |
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
|
| 900 |
+
crawler: '网页爬虫',
|
| 901 |
+
crawlerDescription: '该组件可用于从指定url爬取html源码。',
|
| 902 |
+
proxy: '代理',
|
| 903 |
+
crawlerResultOptions: {
|
| 904 |
+
html: 'Html',
|
| 905 |
+
markdown: 'Markdown',
|
| 906 |
+
content: '文本',
|
| 907 |
+
},
|
| 908 |
+
extractType: '提取类型',
|
| 909 |
info: '信息',
|
| 910 |
history: '历史',
|
| 911 |
financials: '财务',
|
web/src/pages/flow/constant.tsx
CHANGED
|
@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
|
|
| 4 |
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
|
| 5 |
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
|
| 6 |
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
|
|
|
|
| 7 |
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
|
| 8 |
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
|
| 9 |
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
|
|
@@ -73,6 +74,7 @@ export enum Operator {
|
|
| 73 |
Concentrator = 'Concentrator',
|
| 74 |
TuShare = 'TuShare',
|
| 75 |
Note = 'Note',
|
|
|
|
| 76 |
}
|
| 77 |
|
| 78 |
export const CommonOperatorList = Object.values(Operator).filter(
|
|
@@ -110,6 +112,7 @@ export const operatorIconMap = {
|
|
| 110 |
[Operator.Concentrator]: ConcentratorIcon,
|
| 111 |
[Operator.TuShare]: TuShareIcon,
|
| 112 |
[Operator.Note]: NoteIcon,
|
|
|
|
| 113 |
};
|
| 114 |
|
| 115 |
export const operatorMap: Record<
|
|
@@ -233,6 +236,9 @@ export const operatorMap: Record<
|
|
| 233 |
},
|
| 234 |
[Operator.TuShare]: { backgroundColor: '#f8cfa0' },
|
| 235 |
[Operator.Note]: { backgroundColor: '#f8cfa0' },
|
|
|
|
|
|
|
|
|
|
| 236 |
};
|
| 237 |
|
| 238 |
export const componentMenuList = [
|
|
@@ -323,6 +329,9 @@ export const componentMenuList = [
|
|
| 323 |
{
|
| 324 |
name: Operator.TuShare,
|
| 325 |
},
|
|
|
|
|
|
|
|
|
|
| 326 |
];
|
| 327 |
|
| 328 |
export const initialRetrievalValues = {
|
|
@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
|
|
| 572 |
[Operator.Jin10]: [Operator.Begin],
|
| 573 |
[Operator.Concentrator]: [Operator.Begin],
|
| 574 |
[Operator.TuShare]: [Operator.Begin],
|
|
|
|
| 575 |
};
|
| 576 |
|
| 577 |
export const NodeMap = {
|
|
@@ -605,6 +615,7 @@ export const NodeMap = {
|
|
| 605 |
[Operator.Jin10]: 'ragNode',
|
| 606 |
[Operator.TuShare]: 'ragNode',
|
| 607 |
[Operator.Note]: 'noteNode',
|
|
|
|
| 608 |
};
|
| 609 |
|
| 610 |
export const LanguageOptions = [
|
|
@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
|
|
| 2791 |
'fenghuang',
|
| 2792 |
'jinrongjie',
|
| 2793 |
];
|
|
|
|
|
|
| 4 |
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
|
| 5 |
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
|
| 6 |
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
|
| 7 |
+
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
|
| 8 |
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
|
| 9 |
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
|
| 10 |
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
|
|
|
|
| 74 |
Concentrator = 'Concentrator',
|
| 75 |
TuShare = 'TuShare',
|
| 76 |
Note = 'Note',
|
| 77 |
+
Crawler = 'Crawler',
|
| 78 |
}
|
| 79 |
|
| 80 |
export const CommonOperatorList = Object.values(Operator).filter(
|
|
|
|
| 112 |
[Operator.Concentrator]: ConcentratorIcon,
|
| 113 |
[Operator.TuShare]: TuShareIcon,
|
| 114 |
[Operator.Note]: NoteIcon,
|
| 115 |
+
[Operator.Crawler]: CrawlerIcon,
|
| 116 |
};
|
| 117 |
|
| 118 |
export const operatorMap: Record<
|
|
|
|
| 236 |
},
|
| 237 |
[Operator.TuShare]: { backgroundColor: '#f8cfa0' },
|
| 238 |
[Operator.Note]: { backgroundColor: '#f8cfa0' },
|
| 239 |
+
[Operator.Crawler]: {
|
| 240 |
+
backgroundColor: '#dee0e2',
|
| 241 |
+
},
|
| 242 |
};
|
| 243 |
|
| 244 |
export const componentMenuList = [
|
|
|
|
| 329 |
{
|
| 330 |
name: Operator.TuShare,
|
| 331 |
},
|
| 332 |
+
{
|
| 333 |
+
name: Operator.Crawler,
|
| 334 |
+
},
|
| 335 |
];
|
| 336 |
|
| 337 |
export const initialRetrievalValues = {
|
|
|
|
| 581 |
[Operator.Jin10]: [Operator.Begin],
|
| 582 |
[Operator.Concentrator]: [Operator.Begin],
|
| 583 |
[Operator.TuShare]: [Operator.Begin],
|
| 584 |
+
[Operator.Crawler]: [Operator.Begin],
|
| 585 |
};
|
| 586 |
|
| 587 |
export const NodeMap = {
|
|
|
|
| 615 |
[Operator.Jin10]: 'ragNode',
|
| 616 |
[Operator.TuShare]: 'ragNode',
|
| 617 |
[Operator.Note]: 'noteNode',
|
| 618 |
+
[Operator.Crawler]: 'ragNode',
|
| 619 |
};
|
| 620 |
|
| 621 |
export const LanguageOptions = [
|
|
|
|
| 2802 |
'fenghuang',
|
| 2803 |
'jinrongjie',
|
| 2804 |
];
|
| 2805 |
+
// Extraction types offered by the Crawler form's select. Each value is also
// used as the suffix of the `crawlerResultOptions.<value>` translation key.
export const CrawlerResultOptions = ['markdown', 'html', 'content'];
|
web/src/pages/flow/flow-drawer/index.tsx
CHANGED
|
@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
|
|
| 12 |
import BeginForm from '../form/begin-form';
|
| 13 |
import BingForm from '../form/bing-form';
|
| 14 |
import CategorizeForm from '../form/categorize-form';
|
|
|
|
| 15 |
import DeepLForm from '../form/deepl-form';
|
| 16 |
import DuckDuckGoForm from '../form/duckduckgo-form';
|
| 17 |
import ExeSQLForm from '../form/exesql-form';
|
|
@@ -70,6 +71,7 @@ const FormMap = {
|
|
| 70 |
[Operator.YahooFinance]: YahooFinanceForm,
|
| 71 |
[Operator.Jin10]: Jin10Form,
|
| 72 |
[Operator.TuShare]: TuShareForm,
|
|
|
|
| 73 |
};
|
| 74 |
|
| 75 |
const EmptyContent = () => <div>empty</div>;
|
|
|
|
| 12 |
import BeginForm from '../form/begin-form';
|
| 13 |
import BingForm from '../form/bing-form';
|
| 14 |
import CategorizeForm from '../form/categorize-form';
|
| 15 |
+
import CrawlerForm from '../form/crawler-form';
|
| 16 |
import DeepLForm from '../form/deepl-form';
|
| 17 |
import DuckDuckGoForm from '../form/duckduckgo-form';
|
| 18 |
import ExeSQLForm from '../form/exesql-form';
|
|
|
|
| 71 |
[Operator.YahooFinance]: YahooFinanceForm,
|
| 72 |
[Operator.Jin10]: Jin10Form,
|
| 73 |
[Operator.TuShare]: TuShareForm,
|
| 74 |
+
[Operator.Crawler]: CrawlerForm,
|
| 75 |
};
|
| 76 |
|
| 77 |
const EmptyContent = () => <div>empty</div>;
|
web/src/pages/flow/form/crawler-form/index.tsx
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useTranslate } from '@/hooks/common-hooks';
|
| 2 |
+
import { Form, Input, Select } from 'antd';
|
| 3 |
+
import { useMemo } from 'react';
|
| 4 |
+
import { CrawlerResultOptions } from '../../constant';
|
| 5 |
+
import { IOperatorForm } from '../../interface';
|
| 6 |
+
/**
 * Settings form for the Crawler operator.
 *
 * Renders two fields bound to the supplied antd form instance:
 * - `proxy`: free-text input for a proxy address.
 * - `extract_type`: select over `CrawlerResultOptions`, initially "markdown".
 *
 * Field changes are forwarded through `onValuesChange`.
 */
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
  const { t } = useTranslate('flow');

  // Rebuild the select options only when the translation function changes,
  // so the labels follow the active locale.
  const extractTypeOptions = useMemo(
    () =>
      CrawlerResultOptions.map((option) => {
        const label = t(`crawlerResultOptions.${option}`);
        return { value: option, label };
      }),
    [t],
  );

  return (
    <Form
      form={form}
      name="basic"
      autoComplete="off"
      labelCol={{ span: 6 }}
      wrapperCol={{ span: 18 }}
      onValuesChange={onValuesChange}
    >
      <Form.Item name="proxy" label={t('proxy')}>
        <Input placeholder="like: http://127.0.0.1:8888" />
      </Form.Item>
      <Form.Item
        name="extract_type"
        label={t('extractType')}
        initialValue="markdown"
      >
        <Select options={extractTypeOptions} />
      </Form.Item>
    </Form>
  );
};
|
| 36 |
+
|
| 37 |
+
export default CrawlerForm;
|