Compare commits

...

154 Commits

Author SHA1 Message Date
Naibo Wang
fc5aa8368b Update Readme 2025-04-16 13:27:41 +08:00
Naibo Wang
793f028a00 Update Readme 2025-04-16 13:15:58 +08:00
Naibo Wang
ae22977143 Update Readme 2025-04-16 13:12:57 +08:00
Naibo Wang
541b3c13d2
Update Readme.md 2025-04-16 13:09:32 +08:00
Naibo Wang
a6192b730c Update Readme 2025-03-25 16:37:51 +08:00
Naibo_Mac_M2
d39218f5fd Add IPWO 2025-03-18 17:33:24 +08:00
Naibo_Mac_M2
a94c45b36d Add IPWO 2025-03-18 17:32:08 +08:00
Naibo_Mac_M2
0e8aba6b51 Add IPWO 2025-03-18 17:03:32 +08:00
Naibo Wang
e42ad07d80 Update Readme 2025-03-05 11:26:10 +08:00
Naibo Wang
2f6344d00b Update Readme 2025-03-05 11:10:57 +08:00
Naibo Wang
bfa6c0de76 Update Readme 2025-03-05 11:10:15 +08:00
Naibo Wang
b590cc22c5 Change License 2025-02-17 21:12:19 +08:00
Naibo Wang
d69adacbd1 Change License 2025-02-17 21:11:59 +08:00
Naibo Wang
15654da7eb Change License 2025-02-17 21:11:08 +08:00
Naibo Wang
967f5b8033 Change License 2025-02-17 21:10:38 +08:00
Naibo Wang
aa419ee845 Update Readme 2025-02-11 17:04:18 +08:00
Naibo Wang
f005e48700 Update Readme 2025-02-11 17:02:58 +08:00
Naibo Wang
4e96ed7d50 Merge branch 'master' of https://github.com/NaiboWang/EasySpider 2025-02-02 11:34:54 +08:00
Naibo Wang
e3fecc8926 Update Readme 2025-02-02 11:33:28 +08:00
naibo
119cb99711 Screenshots zoom to the maximum size under headless mode 2025-01-08 12:02:36 +08:00
naibo
f43bdd236d Screenshot folder 2025-01-08 11:44:02 +08:00
naibo
56f0847500 Parameter name change for loopExecute 2025-01-07 23:12:26 +08:00
naibo
0df6cebd18 Update ISSUE_TEMPLATE.md 2025-01-06 16:57:49 +08:00
naibo
4b42f6300c Update ISSUE_TEMPLATE.md 2025-01-06 15:15:28 +08:00
naibo
2cf33794f1 Fix bug: when cannot find elements, switch back to the original handle instead of one of the first two handles 2025-01-06 13:26:59 +08:00
naibo
9efd3b6efe Merge branch 'master' of https://github.com/NaiboWang/EasySpider 2025-01-06 01:31:01 +08:00
naibo
ad956be10d Fix bug for the URL shown in the task list 2025-01-06 01:30:48 +08:00
naibo
01de17d471 Update Readme 2025-01-05 03:55:33 +08:00
naibo
333dcd3ff4 New way to open MacOS program 2025-01-03 01:59:18 +08:00
Naibo_Mac_M2
555f02815c Add first_time_run script for MacOS 2025-01-02 16:01:17 +08:00
Naibo_Mac_M2
34ed41110a New script for copying all code files to the Code folder 2025-01-02 14:50:14 +08:00
Naibo_Mac_M2
32459b622d New script for copying all code files to the Code folder 2025-01-02 14:49:56 +08:00
naibo
02cd8599b0 Optimize reading 2024-12-31 03:16:34 +08:00
naibo
2feede55db New way to show/hide toolkits 2024-12-31 02:52:48 +08:00
Naibo Wang
33dda444d7 Specified User Folder 2024-12-31 01:54:51 +08:00
Naibo Wang
d7ccb22d01 Specified User Folder 2024-12-31 01:31:40 +08:00
naibo
f7a842eed6 Version 0.6.3 2024-12-31 00:14:32 +08:00
Naibo Wang
ea6fb049f5
Merge pull request #647 from NaiboWang/dependabot/npm_and_yarn/Extension/manifest_v3/nanoid-3.3.8
Bump nanoid from 3.3.7 to 3.3.8 in /Extension/manifest_v3
2024-12-31 00:02:21 +08:00
Naibo Wang
5216ffba82
Merge pull request #648 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/multi-6bc014718a
Bump path-to-regexp and express in /ElectronJS
2024-12-31 00:02:10 +08:00
dependabot[bot]
4f0851e361
Bump path-to-regexp and express in /ElectronJS
Bumps [path-to-regexp](https://github.com/pillarjs/path-to-regexp) to 0.1.12 and updates ancestor dependency [express](https://github.com/expressjs/express). These dependencies need to be updated together.


Updates `path-to-regexp` from 0.1.10 to 0.1.12
- [Release notes](https://github.com/pillarjs/path-to-regexp/releases)
- [Changelog](https://github.com/pillarjs/path-to-regexp/blob/master/History.md)
- [Commits](https://github.com/pillarjs/path-to-regexp/compare/v0.1.10...v0.1.12)

Updates `express` from 4.21.0 to 4.21.2
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.2/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.21.0...4.21.2)

---
updated-dependencies:
- dependency-name: path-to-regexp
  dependency-type: indirect
- dependency-name: express
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-12-30 16:00:41 +00:00
dependabot[bot]
7bb9d5a374
Bump nanoid from 3.3.7 to 3.3.8 in /Extension/manifest_v3
Bumps [nanoid](https://github.com/ai/nanoid) from 3.3.7 to 3.3.8.
- [Release notes](https://github.com/ai/nanoid/releases)
- [Changelog](https://github.com/ai/nanoid/blob/main/CHANGELOG.md)
- [Commits](https://github.com/ai/nanoid/compare/3.3.7...3.3.8)

---
updated-dependencies:
- dependency-name: nanoid
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-12-30 16:00:38 +00:00
naibo
c56e87120d Version 0.6.3 2024-12-30 23:59:33 +08:00
naibo
5180f47b70 Add llm and fl beta code 2024-12-24 00:14:35 +08:00
Naibo_Mac_M2
b4d7ddf5cb Fix bug of document empty because of html.parsestring function 2024-12-11 23:17:21 +08:00
Naibo_Mac_M2
2031b09297 Update Readme 2024-11-25 17:34:16 +08:00
Naibo Wang
cc9a8082da
Update README.md 2024-11-25 17:32:09 +08:00
Naibo Wang
3daf5e8c21
Update README.md 2024-11-25 17:30:36 +08:00
Naibo Wang
8f5d7a3a52
Update main.js about execute.bat name 2024-11-25 17:23:45 +08:00
Naibo_Mac_M2
ee4a077630 Update Readme 2024-11-22 18:36:34 +08:00
naibo
3fe6f42366 Update Readme 2024-11-21 02:18:50 +08:00
naibo
eb3b578745 Add #1 Github Trending Badge 2024-11-12 17:08:47 +08:00
Naibo Wang
4ca5333f8b
Merge pull request #597 from touero/master
Create github issue template file to get details of config
2024-11-08 16:04:16 +11:00
Naibo Wang
b50d4eae3f
Update ISSUE_TEMPLATE.md 2024-11-08 13:03:32 +08:00
Ensong Wei
998a1ddb19
fix: supplementary English in issue template file 2024-11-07 22:05:12 +08:00
touero
07563bc750 fix: format line 2024-11-05 17:15:41 +08:00
touero
7b5ccf4a78 feat: add github issue template file 2024-11-05 17:08:35 +08:00
Naibo Wang
209235de8d
Update Readme.md 2024-11-04 14:24:32 +11:00
naibo
72529c0675 Show detailed JavaScript Error 2024-10-18 17:02:11 +08:00
naibo
081c49357e Update Readme 2024-10-18 16:43:06 +08:00
Naibo Wang
b611ddb6cd
Update README.md 2024-10-15 13:42:48 +08:00
Naibo Wang
abfac8c342
Update README.md 2024-10-15 13:41:45 +08:00
naibo
951a39fff6 Update Readme for building Electron Program 2024-10-15 05:42:06 +08:00
naibo
6d3d10f7a7 Update Readme for building Electron Program 2024-10-15 05:39:51 +08:00
naibo
46b1959564 Update Readme for building Electron Program 2024-10-15 05:33:15 +08:00
naibo
e14896d7cd Update Readme for building Electron Program 2024-10-15 05:15:33 +08:00
naibo
450dfa1a77 Add ElectronJS package speedup solution for Machines in China 2024-10-14 03:40:13 +08:00
naibo
3b907ba382 Add ElectronJS package speedup solution for Machines in China 2024-10-14 03:14:19 +08:00
naibo
70dd90470f Add ElectronJS package speedup solution for Machines in China 2024-10-14 02:49:01 +08:00
naibo
cc8bb70715 RollBack vue plugin version 2024-09-18 16:07:43 +08:00
Naibo Wang
c5f1696f11
Merge pull request #556 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/multi-9423f4c335
Bump body-parser and express in /ElectronJS
2024-09-18 16:02:12 +08:00
Naibo Wang
b987408fc2
Merge pull request #557 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/multi-cf87d80143
Bump send and express in /ElectronJS
2024-09-18 16:02:01 +08:00
Naibo Wang
391f0ea99d
Merge pull request #554 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/multi-d66d039ac5
Bump serve-static and express in /ElectronJS
2024-09-18 16:01:51 +08:00
Naibo Wang
a94b67a1f6
Merge pull request #553 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/multi-1d234c620e
Bump path-to-regexp and express in /ElectronJS
2024-09-18 16:01:43 +08:00
Naibo Wang
54ef89aef7
Merge pull request #552 from NaiboWang/dependabot/npm_and_yarn/Extension/manifest_v3/multi-033fad549c
Bump vite and @vitejs/plugin-vue in /Extension/manifest_v3
2024-09-18 16:01:32 +08:00
dependabot[bot]
22a3b45f13
Bump body-parser and express in /ElectronJS
Bumps [body-parser](https://github.com/expressjs/body-parser) to 1.20.3 and updates ancestor dependency [express](https://github.com/expressjs/express). These dependencies need to be updated together.


Updates `body-parser` from 1.20.2 to 1.20.3
- [Release notes](https://github.com/expressjs/body-parser/releases)
- [Changelog](https://github.com/expressjs/body-parser/blob/master/HISTORY.md)
- [Commits](https://github.com/expressjs/body-parser/compare/1.20.2...1.20.3)

Updates `express` from 4.19.2 to 4.21.0
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.0/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.19.2...4.21.0)

---
updated-dependencies:
- dependency-name: body-parser
  dependency-type: indirect
- dependency-name: express
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-17 22:29:05 +00:00
dependabot[bot]
44bfb69a36
Bump send and express in /ElectronJS
Bumps [send](https://github.com/pillarjs/send) to 0.19.0 and updates ancestor dependency [express](https://github.com/expressjs/express). These dependencies need to be updated together.


Updates `send` from 0.18.0 to 0.19.0
- [Release notes](https://github.com/pillarjs/send/releases)
- [Changelog](https://github.com/pillarjs/send/blob/master/HISTORY.md)
- [Commits](https://github.com/pillarjs/send/compare/0.18.0...0.19.0)

Updates `express` from 4.19.2 to 4.21.0
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.0/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.19.2...4.21.0)

---
updated-dependencies:
- dependency-name: send
  dependency-type: indirect
- dependency-name: express
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-17 22:29:05 +00:00
dependabot[bot]
5c1207649d
Bump serve-static and express in /ElectronJS
Bumps [serve-static](https://github.com/expressjs/serve-static) to 1.16.2 and updates ancestor dependency [express](https://github.com/expressjs/express). These dependencies need to be updated together.


Updates `serve-static` from 1.15.0 to 1.16.2
- [Release notes](https://github.com/expressjs/serve-static/releases)
- [Changelog](https://github.com/expressjs/serve-static/blob/v1.16.2/HISTORY.md)
- [Commits](https://github.com/expressjs/serve-static/compare/v1.15.0...v1.16.2)

Updates `express` from 4.19.2 to 4.21.0
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.0/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.19.2...4.21.0)

---
updated-dependencies:
- dependency-name: serve-static
  dependency-type: indirect
- dependency-name: express
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-17 22:29:00 +00:00
dependabot[bot]
c967db3dac
Bump path-to-regexp and express in /ElectronJS
Bumps [path-to-regexp](https://github.com/pillarjs/path-to-regexp) to 0.1.10 and updates ancestor dependency [express](https://github.com/expressjs/express). These dependencies need to be updated together.


Updates `path-to-regexp` from 0.1.7 to 0.1.10
- [Release notes](https://github.com/pillarjs/path-to-regexp/releases)
- [Changelog](https://github.com/pillarjs/path-to-regexp/blob/master/History.md)
- [Commits](https://github.com/pillarjs/path-to-regexp/compare/v0.1.7...v0.1.10)

Updates `express` from 4.19.2 to 4.21.0
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.0/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.19.2...4.21.0)

---
updated-dependencies:
- dependency-name: path-to-regexp
  dependency-type: indirect
- dependency-name: express
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-17 22:28:58 +00:00
dependabot[bot]
baec9c4298
Bump vite and @vitejs/plugin-vue in /Extension/manifest_v3
Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) to 5.4.6 and updates ancestor dependency [@vitejs/plugin-vue](https://github.com/vitejs/vite-plugin-vue/tree/HEAD/packages/plugin-vue). These dependencies need to be updated together.


Updates `vite` from 2.9.18 to 5.4.6
- [Release notes](https://github.com/vitejs/vite/releases)
- [Changelog](https://github.com/vitejs/vite/blob/v5.4.6/packages/vite/CHANGELOG.md)
- [Commits](https://github.com/vitejs/vite/commits/v5.4.6/packages/vite)

Updates `@vitejs/plugin-vue` from 1.10.2 to 5.1.3
- [Release notes](https://github.com/vitejs/vite-plugin-vue/releases)
- [Changelog](https://github.com/vitejs/vite-plugin-vue/blob/main/packages/plugin-vue/CHANGELOG.md)
- [Commits](https://github.com/vitejs/vite-plugin-vue/commits/plugin-vue@5.1.3/packages/plugin-vue)

---
updated-dependencies:
- dependency-name: vite
  dependency-type: indirect
- dependency-name: "@vitejs/plugin-vue"
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-17 20:30:57 +00:00
naibo
3e7abd6273 Update BrightData 2024-09-10 22:50:51 +08:00
naibo
32df9d5060 Update Poster 2024-09-09 20:35:59 +08:00
naibo
05c52f9dc8 Change 98IP to Koala-IP 2024-09-09 15:53:17 +08:00
naibo
7c4dafc002 Add First Sponsor 2024-08-27 23:52:23 +08:00
Naibo_Mac_M2
2afaf43162 MacOS Prompt to tasks 2024-08-24 17:42:31 +08:00
naibo
b79d92df1d Add 98IP 2024-08-22 23:06:23 +08:00
naibo
e4e1a1b095 Add 98IP 2024-08-22 22:19:49 +08:00
naibo
048dfb1f4b Add 98IP 2024-08-22 21:38:59 +08:00
naibo
1750481744 Add 98IP 2024-08-22 21:20:44 +08:00
naibo
3ead5e7312 Add 98IP 2024-08-22 19:16:18 +08:00
naibo
81957adb52 Add 98IP 2024-08-22 19:13:42 +08:00
naibo
dbad074565 Add 98IP 2024-08-22 13:54:35 +08:00
naibo
8342135b36 Add 98IP 2024-08-22 13:46:41 +08:00
naibo
e74915d94c Typo Fix 2024-08-21 11:57:08 +08:00
naibo
df62f710e3 Change F7 to F2 2024-08-21 11:40:21 +08:00
naibo
118241ba6d 修复任意文件读取漏洞 2024-08-10 17:32:01 +08:00
Naibo Wang
de47e8516a
Update Readme.md 2024-07-29 19:31:06 +08:00
naibo
d438e4b19d Add AD 2024-07-29 19:26:22 +08:00
naibo
0003041dab Add AD 2024-07-29 19:25:43 +08:00
naibo
ec3d9094bf Usage Example Section 2024-07-29 17:28:11 +08:00
naibo
629509a588 Change Download location 2024-07-29 17:26:52 +08:00
naibo
5e17563d11 New AD 2024-07-29 17:10:51 +08:00
naibo
5acafe7948 New AD 2024-07-29 16:54:51 +08:00
naibo
c25f80c175 New AD 2024-07-29 16:54:07 +08:00
naibo
ab88b33c74 New AD 2024-07-29 16:47:33 +08:00
Naibo Wang
7442e43be3
Linux64 new login shell 2024-07-13 22:30:08 +08:00
naibo
a0518412b0 New startup shell for sandbox 2024-07-13 22:27:29 +08:00
naibo
9ccb56aeae New startup shell for sandbox 2024-07-13 22:21:05 +08:00
naibo
3601ddb14d New startup shell for sandbox 2024-07-13 22:11:07 +08:00
Naibo Wang
728a5cb3ea
+x easy-spider.sh 2024-07-13 21:58:16 +08:00
naibo
46909e4866 New start shell for Ubuntu 24.04 2024-07-13 21:44:52 +08:00
naibo
072b6ad21e Bug fix for ... 2024-07-12 20:24:05 +08:00
naibo
bf320abf1a Update Complie and Debug Video Address 2024-07-12 19:39:46 +08:00
Naibo Wang
2d7c3c1323
Merge pull request #362 from touero/master
Dictionary's get replace catch exception in first three if case
2024-07-12 17:38:35 +08:00
Naibo Wang
c185e914e7
Update easyspider_executestage.py
skipCount from 1 to 0
2024-07-12 17:37:45 +08:00
naibo
7c0ab0e519 More XPaths Bug Fix 2024-07-12 17:12:45 +08:00
naibo
f50b08e9c4 MySQL Constant Bug Fix 2024-07-12 16:47:51 +08:00
Naibo Wang
ff7d82f4d0
Merge pull request #435 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/ws-8.17.1
Bump ws from 8.14.2 to 8.17.1 in /ElectronJS
2024-06-19 10:48:45 +08:00
dependabot[bot]
944d968679
Bump ws from 8.14.2 to 8.17.1 in /ElectronJS
Bumps [ws](https://github.com/websockets/ws) from 8.14.2 to 8.17.1.
- [Release notes](https://github.com/websockets/ws/releases)
- [Commits](https://github.com/websockets/ws/compare/8.14.2...8.17.1)

---
updated-dependencies:
- dependency-name: ws
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-06-18 20:10:49 +00:00
Naibo Wang
9f1f152680
Merge pull request #431 from NaiboWang/dependabot/npm_and_yarn/ElectronJS/braces-3.0.3
Bump braces from 3.0.2 to 3.0.3 in /ElectronJS
2024-06-17 17:24:31 +08:00
dependabot[bot]
18321e4fee
Bump braces from 3.0.2 to 3.0.3 in /ElectronJS
Bumps [braces](https://github.com/micromatch/braces) from 3.0.2 to 3.0.3.
- [Changelog](https://github.com/micromatch/braces/blob/master/CHANGELOG.md)
- [Commits](https://github.com/micromatch/braces/compare/3.0.2...3.0.3)

---
updated-dependencies:
- dependency-name: braces
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-06-17 09:23:33 +00:00
Naibo Wang
b79bda9001
Merge pull request #425 from NaiboWang/dependabot/npm_and_yarn/Extension/manifest_v3/multi-6365d02c7e
Bump @grpc/grpc-js and firebase in /Extension/manifest_v3
2024-06-17 17:22:28 +08:00
dependabot[bot]
80bc210ff1
Bump @grpc/grpc-js and firebase in /Extension/manifest_v3
Bumps [@grpc/grpc-js](https://github.com/grpc/grpc-node) to 1.9.15 and updates ancestor dependency [firebase](https://github.com/firebase/firebase-js-sdk). These dependencies need to be updated together.


Updates `@grpc/grpc-js` from 1.7.3 to 1.9.15
- [Release notes](https://github.com/grpc/grpc-node/releases)
- [Commits](https://github.com/grpc/grpc-node/compare/@grpc/grpc-js@1.7.3...@grpc/grpc-js@1.9.15)

Updates `firebase` from 9.23.0 to 10.12.2
- [Release notes](https://github.com/firebase/firebase-js-sdk/releases)
- [Changelog](https://github.com/firebase/firebase-js-sdk/blob/master/CHANGELOG.md)
- [Commits](https://github.com/firebase/firebase-js-sdk/compare/firebase@9.23.0...firebase@10.12.2)

---
updated-dependencies:
- dependency-name: "@grpc/grpc-js"
  dependency-type: indirect
- dependency-name: firebase
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-06-11 00:25:40 +00:00
Naibo Wang
dbf7681518
Merge pull request #392 from NaiboWang/dependabot/pip/ExecuteStage/pymysql-1.1.1
Bump pymysql from 1.1.0 to 1.1.1 in /ExecuteStage
2024-05-22 18:22:18 +08:00
Naibo Wang
f18616e3ff
Merge pull request #391 from NaiboWang/dependabot/pip/ExecuteStage/requests-2.32.0
Bump requests from 2.31.0 to 2.32.0 in /ExecuteStage
2024-05-22 18:21:43 +08:00
dependabot[bot]
911ea02f3f
---
updated-dependencies:
- dependency-name: pymysql
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-05-21 19:54:31 +00:00
dependabot[bot]
22f86cf0f2
---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-05-21 05:25:55 +00:00
naibo
0285246337 Merge branch 'master' of https://github.com/NaiboWang/EasySpider 2024-05-16 02:12:46 +08:00
naibo
4fdce9a915 Update Stealth.min.js 2024-05-16 02:12:34 +08:00
touero
15aab7c0c5 fix: remove unnecessary variables make it more concise 2024-05-07 21:50:30 +08:00
touero
3ec64d2623 [fix] the rest if case from try catch turn to dict' get 2024-05-04 11:30:14 +08:00
touero
5582205204 fix: dictionary's get replace catch exception in first three if case 2024-04-28 00:04:33 +08:00
Naibo Wang
c272e5da86
Merge pull request #360 from touero/master
Fixing get data before if case in preprocess event loop
2024-04-27 22:39:09 +08:00
touero
52702d4eb3 fix: getting data before if case in preprocess event loop 2024-04-27 00:18:30 +08:00
Naibo Wang
a8e77b5e15
Merge pull request #359 from touero/master
Define constants using enumeration classes
2024-04-26 10:40:34 +08:00
touero
606de75577 fix: format string and using enum class defined constants Ⅱ 2024-04-25 23:58:29 +08:00
touero
76fd4bad55 fix: format string and using enum class defined constants 2024-04-25 23:44:44 +08:00
Naibo_Mac_M2
2860bc7b8c Fix wrong word 2024-04-25 22:09:22 +08:00
Naibo_Mac_M2
ebe8a56a6f Bug fix for Field[] 2024-04-25 21:56:23 +08:00
Naibo Wang
e086de2852
Merge pull request #356 from touero/master
Getting data by dictionary's 'get' and remove not necessary catching Exception
2024-04-25 00:20:29 +08:00
touero
c2d16e13c2 fix: get data by dictionary's 'get' and remove not necessary catching Exception 2024-04-24 23:57:47 +08:00
naibo
e43318f57a Bug fix for Excel Upload 2024-04-24 23:31:28 +08:00
naibo
7849707486 Bug fix for Local Server 2024-04-24 23:23:01 +08:00
naibo
b1632459ef Bug fix for OS Version 2024-04-24 23:12:03 +08:00
naibo
a2bd496e8e Change windows to Windows 2024-04-24 22:04:16 +08:00
naibo
9ed61c4f50 Remove force headless 2024-04-24 02:20:05 +08:00
naibo
c8b71835de Update Docker 2024-04-23 23:55:42 +08:00
naibo
0afa159c98 Update Only Server 2024-04-23 23:22:54 +08:00
naibo
3ba748b101 Update Readme 2024-04-23 22:26:31 +08:00
Naibo Wang
818d3e0ddc Docker Support 2024-04-23 22:19:49 +08:00
Naibo Wang
ad568af5f3 Docker Support 2024-04-23 21:55:45 +08:00
naibo
b2a6fd6b6b win32 2024-04-22 19:12:53 +08:00
Naibo_Mac_M2
960cf74de1 MacOS 2024-04-22 08:24:14 +08:00
Naibo Wang
fce97dec61 Linux 2024-04-22 07:44:19 +08:00
Naibo Wang
3ffd34d0fd Linux 2024-04-22 07:13:54 +08:00
109 changed files with 3410 additions and 2106 deletions

25
.github/ISSUE_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,25 @@
## 版本信息 Version Information
**EasySpider版本 EasySpider Version**:
**系统版本(架构) System Version (Architecture)**:
**浏览器版本 Browser Version**:
**安装方式 Installation method**:
## 问题描述 Issue Description
## 如何复现 Steps to Reproduce
## 示例任务文件 Example Task File
Windows和Linux版本的软件设计的任务文件在软件目录下的`tasks`文件夹中,文件名为任务列表中`任务的ID号.json`MacOS系统的任务文件目录请运行下面的命令打开tasks文件夹
The task file designed for the Windows and Linux versions of the software is in the `tasks` folder in the software directory, and the file name is `the ID number of the task.json` in the task list; the task file directory of the MacOS system is opened by running the following command:
```bash
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
open .
```
请将任务文件直接以文件的方式粘贴到这里,不要截图和打开复制里面的内容。
Please paste the task file directly as a file here, do not take screenshots and open to copy the content.

2
.gitignore vendored
View File

@ -14,3 +14,5 @@ old_code/
*.tar.xz
*.zip
Data/
**/__pycache__/
**/.venv/

View File

@ -1,10 +1,10 @@
EasySpider_MacOS/easyspider_executestage
EasySpider_MacOS/easyspider_executestage_full
EasySpider_Linux64_x64/user_data
EasySpider_windows_x32/user_data
EasySpider_Windows_x32/user_data
EasySpider
EasySpider.app/
EasySpider_windows_x64/user_data
EasySpider_Windows_x64/user_data
*.tmp
*.tar.gz
*.7z*

View File

@ -5,9 +5,11 @@ import copy
import platform
import shutil
import string
import threading
# import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from constants import WriteMode, DataWriteMode, GraphOption
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
@ -30,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from pynput.keyboard import Key, Listener
from datetime import datetime
import io # 遇到错误退出时应执行的代码
import json
@ -75,10 +76,7 @@ class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
Thread.__init__(self)
self.logs = io.StringIO()
try:
self.log = bool(service["recordLog"])
except:
self.log = True
self.log = bool(service.get("recordLog", True))
self.browser = browser_t
self.option = option
self.config = config
@ -86,22 +84,13 @@ class BrowserThread(Thread):
self.totalSteps = 0
self.id = id
self.event = event
try:
self.saveName = service["saveName"] # 保存文件的名字
except:
now = datetime.now()
# 将时间格式化为精确到秒的字符串
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
now = datetime.now()
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
self.OUTPUT = ""
self.SAVED = False
self.BREAK = False
self.CONTINUE = False
try:
maximizeWindow = service["maximizeWindow"]
except:
maximizeWindow = 0
if maximizeWindow == 1:
self.browser.maximize_window()
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
# 名称设定
if saveName != "": # 命令行覆盖保存名称
self.saveName = saveName # 保存文件的名字
@ -112,19 +101,23 @@ class BrowserThread(Thread):
self.print_and_log("Save Name for task ID", id, "is:", self.saveName)
if not os.path.exists("Data/Task_" + str(id)):
os.mkdir("Data/Task_" + str(id))
if not os.path.exists("Data/Task_" + str(id) + "/" + self.saveName):
os.mkdir("Data/Task_" + str(id) + "/" +
self.saveName) # 创建保存文件夹用来保存截图
self.downloadFolder = "Data/Task_" + str(id) + "/" + self.saveName
if not os.path.exists(self.downloadFolder):
os.mkdir(self.downloadFolder) # 创建保存文件夹用来保存截图和文件
if not os.path.exists(self.downloadFolder + "/files"):
os.mkdir(self.downloadFolder + "/files")
if not os.path.exists(self.downloadFolder + "/images"):
os.mkdir(self.downloadFolder + "/images")
self.getDataStep = 0
self.startSteps = 0
try:
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
if startFromExit == 1:
if service.get("startFromExit", 0) == 1:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
encoding='utf-8-sig') as file_obj:
self.startSteps = int(file_obj.read()) # 读取已执行步数
except:
pass
except Exception as e:
self.print_and_log(f"读取steps.txt失败原因{str(e)}")
if self.startSteps != 0:
self.print_and_log("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
self.startSteps, "条。")
@ -132,7 +125,7 @@ class BrowserThread(Thread):
"will start from the last step, before we already collected", self.startSteps, " items.")
else:
self.print_and_log("此模式下任务ID", self.id,
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
self.print_and_log("In this mode, task ID", self.id,
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
stealth_path = driver_path[:driver_path.find(
@ -140,78 +133,83 @@ class BrowserThread(Thread):
with open(stealth_path, 'r') as f:
js = f.read()
self.print_and_log("Loading stealth.min.js")
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
WebDriverWait(self.browser, 10)
self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id))
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id), self.saveName, "files")
self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
self.browser.execute("send_command", self.paramss) # 下载地址改变
self.browser.execute("send_command", self.paramss) # 下载目录改变
self.monitor_event = threading.Event()
self.monitor_thread = threading.Thread(target=rename_downloaded_file, args=(path, self.monitor_event)) #path后面的逗号不能省略是元组固定写法
self.monitor_thread.start()
# self.browser.get('about:blank')
self.procedure = service["graph"] # 程序执行流程
try:
self.maxViewLength = service["maxViewLength"] # 最大显示长度
except:
self.maxViewLength = 15
try:
self.outputFormat = service["outputFormat"] # 输出格式
except:
self.outputFormat = "csv"
try:
self.task_version = service["version"] # 任务版本
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
if service["version"] != version:
self.print_and_log("版本不一致,请使用" +
service["version"] + "版本的EasySpider运行该任务")
self.print_and_log("Version not match, please use EasySpider " +
service["version"] + " to run this task!")
self.browser.quit()
sys.exit()
except: # 0.2.0版本没有version字段所以直接退出
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式1为追加2为覆盖3为重命名文件
self.task_version = service.get("version", "") # 任务版本
if not self.task_version:
self.print_and_log("版本不一致请使用v0.2.0版本的EasySpider运行该任务")
self.print_and_log(
"Version not match, please use EasySpider v0.2.0 to run this task!")
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
self.browser.quit()
sys.exit()
try:
self.save_threshold = service["saveThreshold"] # 保存最低阈值
except:
self.save_threshold = 10
try:
self.links = list(
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
except:
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务")
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
self.browser.quit()
sys.exit()
service_links = service.get("links")
if service_links:
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
else:
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
self.OUTPUT = [] # 采集的数据
try:
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式1为追加2为覆盖
except:
self.dataWriteMode = 1
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
if self.dataWriteMode == 2 and os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
self.writeMode = 1 # 写入模式0为新建1为追加
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
if self.dataWriteMode == DataWriteMode.Cover.value:
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
elif self.dataWriteMode == DataWriteMode.Rename.value:
i = 2
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
i = i + 1
self.saveName = self.saveName + '_' + str(i)
self.print_and_log("文件已存在,已重命名为", self.saveName)
self.writeMode = WriteMode.Create.value # 写入模式0为新建1为追加
if self.outputFormat in ['csv', 'txt', 'xlsx']:
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
self.OUTPUT.append([]) # 添加表头
self.writeMode = 0
self.writeMode = WriteMode.Create.value
elif self.outputFormat == "json":
self.writeMode = 3 # JSON模式无需判断是否存在文件
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
elif self.outputFormat == "mysql":
self.mysql = myMySQL(config["mysql_config_path"])
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
self.writeMode = 2
if self.writeMode == 0:
self.mysql.create_table(self.saveName, service["outputParameters"],
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
self.writeMode = WriteMode.MySQL.value # MySQL模式
if self.writeMode == WriteMode.Create.value:
self.print_and_log("新建模式|Create Mode")
elif self.writeMode == 1:
elif self.writeMode == WriteMode.Append.value:
self.print_and_log("追加模式|Append Mode")
elif self.writeMode == 2:
elif self.writeMode == WriteMode.MySQL.value:
self.print_and_log("MySQL模式|MySQL Mode")
elif self.writeMode == 3:
elif self.writeMode == WriteMode.Json.value:
self.print_and_log("JSON模式|JSON Mode")
self.containJudge = service["containJudge"] # 是否含有判断语句
self.outputParameters = {}
self.service = service
@ -224,191 +222,140 @@ class BrowserThread(Thread):
if param["name"] not in self.outputParameters.keys():
self.outputParameters[param["name"]] = ""
self.dataNotFoundKeys[param["name"]] = False
try:
self.outputParametersTypes.append(param["type"])
except:
self.outputParametersTypes.append("text")
try:
self.outputParametersRecord.append(
bool(param["recordASField"]))
except:
self.outputParametersRecord.append(True)
self.outputParametersTypes.append(param.get("type", "text"))
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
# 文件叠加的时候不添加表头
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if self.writeMode == 0:
self.OUTPUT[0].append(param["name"])
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
self.OUTPUT[0].append(param["name"])
self.urlId = 0 # 全局记录变量
self.preprocess() # 预处理,优化提取数据流程
try:
self.inputExcel = service["inputExcel"] # 输入Excel
except:
self.inputExcel = ""
self.inputExcel = service.get("inputExcel", "") # 输入Excel
self.readFromExcel() # 读取Excel获得参数值
# 检测如果没有复杂的操作,优化提取数据流程
def preprocess(self):
for node in self.procedure:
try:
iframe = node["parameters"]["iframe"]
except:
node["parameters"]["iframe"] = False
for index_node, node in enumerate(self.procedure):
parameters: dict = node["parameters"]
iframe = parameters.get('iframe')
option = node["option"]
try:
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
node["parameters"]["xpath"])
except:
pass
try:
node["parameters"]["waitElementIframeIndex"] = int(
node["parameters"]["waitElementIframeIndex"])
except:
node["parameters"]["waitElement"] = ""
node["parameters"]["waitElementTime"] = 10
node["parameters"]["waitElementIframeIndex"] = 0
if node["option"] == 1: # 打开网页操作
try:
cookies = node["parameters"]["cookies"]
except:
node["parameters"]["cookies"] = ""
elif node["option"] == 2: # 点击操作
try:
alertHandleType = node["parameters"]["alertHandleType"]
except:
node["parameters"]["alertHandleType"] = 0
if node["parameters"]["useLoop"]:
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
if parameters.get("xpath"):
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
if parameters.get("waitElementIframeIndex"):
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
else:
parameters["waitElement"] = ""
parameters["waitElementTime"] = 10
parameters["waitElementIframeIndex"] = 0
if option == GraphOption.Get.value: # 打开网页操作
parameters["cookies"] = parameters.get("cookies", "")
elif option == GraphOption.Click.value: # 点击操作
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
if parameters.get("useLoop"):
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 3: # 提取数据操作
node["parameters"]["recordASField"] = 0
try:
params = node["parameters"]["params"]
except:
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
params = node["parameters"]["params"]
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Extract.value: # 提取数据操作
parameters["recordASField"] = 0
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
parameters["clear"] = parameters.get("clear", 0)
parameters["newLine"] = parameters.get("newLine", 1)
params = parameters["params"]
for param in params:
try:
iframe = param["iframe"]
except:
param["iframe"] = False
try:
param["iframe"] = param.get("iframe", False)
if param.get("relativeXPath"):
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
except:
pass
try:
node["parameters"]["recordASField"] = param["recordASField"]
except:
node["parameters"]["recordASField"] = 1
try:
splitLine = int(param["splitLine"])
except:
param["splitLine"] = 0
if param["contentType"] == 8:
self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log(
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
parameters["recordASField"] = param.get("recordASField", 1)
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
if param.get("contentType") == 8:
self.print_and_log("默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType =="
"8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片"
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
"的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
"modify the source code get_content function -> contentType == 8 position "
"to your own OCR model and then compile and run it; or you can first set "
"the content type of the crawler to \"Element Screenshot\" to save the "
"picture, and then call your own program with custom operations. The "
"function of the program is to read the latest generated picture, then use "
"a good model, such as PaddleOCR to recognize the picture, and then return "
"the return value as a parameter output to the program.")
param["optimizable"] = detect_optimizable(param)
elif node["option"] == 4: # 输入文字
try:
index = node["parameters"]["index"] # 索引值
except:
node["parameters"]["index"] = 0
elif node["option"] == 5: # 自定义操作
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
elif node["option"] == 7: # 移动到元素
if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 8: # 循环操作
try:
exitElement = node["parameters"]["exitElement"]
if exitElement == "":
node["parameters"]["exitElement"] = "//body"
except:
node["parameters"]["exitElement"] = "//body"
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
try:
skipCount = node["parameters"]["skipCount"]
except:
node["parameters"]["skipCount"] = 0
elif option == GraphOption.Input.value: # 输入文字
parameters['index'] = parameters.get('index', 0)
elif option == GraphOption.Custom.value: # 自定义操作
parameters['clear'] = parameters.get('clear', 0)
parameters['newLine'] = parameters.get('newLine', 1)
elif option == GraphOption.Move.value: # 移动到元素
if parameters.get('useLoop'):
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Loop.value: # 循环操作
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
parameters["quickExtractable"] = False # 是否可以快速提取
parameters['skipCount'] = parameters.get('skipCount', 0)
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
try:
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
except:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
try:
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
except:
waitElement = ""
if node["parameters"]["iframe"]:
node["parameters"]["quickExtractable"] = False # 如果是iframe那么不可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
if not params:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
if parameters["iframe"]:
parameters["quickExtractable"] = False # 如果是iframe那么不可以快速提取
else:
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
if node["parameters"]["skipCount"] > 0:
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
parameters["quickExtractable"] = True # 先假设可以快速提取
if parameters["skipCount"] > 0:
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
for param in params:
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
try:
iframe = param["iframe"]
except:
param["iframe"] = False
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
param['iframe'] = param.get('iframe', False)
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
optimizable = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
node["parameters"]["quickExtractable"] = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
parameters["quickExtractable"] = False
break
if node["parameters"]["quickExtractable"]:
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
try:
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
if parameters["quickExtractable"]:
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
node["parameters"]["quickParams"] = []
for param in params:
content_type = ""
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
"::text()") >= 0:
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
or param["relativeXPath"].find("::text()") >= 0:
content_type = ""
elif param["nodeType"] == 2:
content_type = "//@href"
elif param["nodeType"] == 4: # 图片链接
elif param["nodeType"] == 4: # 图片链接
content_type = "//@src"
elif param["contentType"] == 1:
content_type = "/text()"
elif param["contentType"] == 0:
content_type = "//text()"
if param["relative"]: # 如果是相对XPath
if param["relative"]: # 如果是相对XPath
xpath = "." + param["relativeXPath"] + content_type
else:
xpath = param["relativeXPath"] + content_type
@ -422,6 +369,7 @@ class BrowserThread(Thread):
"nodeType": param["nodeType"],
"default": param["default"],
})
self.procedure[index_node]["parameters"] = parameters
self.print_and_log("预处理完成|Preprocess completed")
def readFromExcel(self):
@ -521,7 +469,7 @@ class BrowserThread(Thread):
"/", len(self.links))
self.executeNode(0)
self.urlId = self.urlId + 1
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# 如果目录为空,则删除该目录
# if not files:
# os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
@ -538,12 +486,16 @@ class BrowserThread(Thread):
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
time.sleep(quitWaitTime)
self.browser.quit()
try:
self.browser.quit()
except:
pass
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
try:
shutil.rmtree(self.option["tmp_user_data_folder"])
except:
pass
self.monitor_event.set()
self.print_and_log("清理完成!|Clean up completed!")
self.print_and_log("您现在可以安全的关闭此窗口了。|You can safely close this window now.")
@ -753,28 +705,32 @@ class BrowserThread(Thread):
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 2:
self.recordLog("Execute JavaScript for element:" + code)
self.recordLog("对元素执行JavaScript:" + code)
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code, element)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 5:
try:
code = readCode(code)
# global_namespace = globals().copy()
# global_namespace["self"] = self
output = exec(code)
self.recordLog("执行下面的代码:" + code)
self.recordLog("Execute the following code:" + code)
except Exception as e:
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" +
code, ", error is:", e)
code, ", error is:", str(e))
elif int(codeMode) == 6:
try:
code = readCode(code)
@ -847,6 +803,23 @@ class BrowserThread(Thread):
self.print_and_log("根据设置的自定义操作,任务已刷新页面|Task refreshed page according to custom operation")
elif codeMode == 9: # 发送邮件
send_email(node["parameters"]["emailConfig"])
elif codeMode == 10: # 清空所有字段值
self.clearOutputParameters()
elif codeMode == 11: # 生成新的数据行
line = new_line(self.outputParameters,
self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line)
elif codeMode == 12: # 退出程序
self.print_and_log("根据设置的自定义操作,任务已退出|Task exited according to custom operation")
self.saveData(exit=True)
self.browser.quit()
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
try:
shutil.rmtree(self.option["tmp_user_data_folder"])
except:
pass
self.print_and_log("清理完成!|Clean up completed!")
os._exit(0)
else: # 0 1 5 6
output = self.execute_code(
codeMode, code, max_wait_time, iframe=params["iframe"])
@ -1106,7 +1079,25 @@ class BrowserThread(Thread):
self.recordLog(
"判断条件内所有条件分支的条件都不满足|None of the conditions in the judgment condition are met")
def handleHistory(self, node, xpath, thisHistoryURL, thisHistoryLength, index, element=None, elements=None):
def handleHistory(self, node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, element=None, elements=None):
try:
changed_handle = self.browser.current_window_handle != thisHandle
except: # 如果网页被意外关闭了的情况下
self.browser.switch_to.window(
self.browser.window_handles[-1])
changed_handle = self.browser.window_handles[-1] != thisHandle
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
try:
while True: # 一直关闭窗口直到当前标签页
self.browser.close() # 关闭使用完的标签页
self.browser.switch_to.window(
self.browser.window_handles[-1])
if self.browser.current_window_handle == thisHandle:
break
except Exception as e:
self.print_and_log("关闭标签页发生错误:", e)
self.print_and_log(
"Error occurred while closing tab: ", e)
if self.history["index"] != thisHistoryLength and self.history["handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
difference = thisHistoryLength - self.history["index"] # 计算历史记录变化差值
self.browser.execute_script('history.go(' + str(difference) + ')') # 回退历史记录
@ -1132,12 +1123,13 @@ class BrowserThread(Thread):
if self.browser.current_url == thisHistoryURL or ti > thisHistoryLength: # 如果执行完一次循环之后网址发生了变化
break
time.sleep(2)
if element == None: # 不固定元素列表
element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
else: # 固定元素列表
element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
# if index > 0:
# index -= 1 # 如果是data:开头的网址,就要重试一次
if xpath != "":
if element == None: # 不固定元素列表
element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
else: # 固定元素列表
element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
# if index > 0:
# index -= 1 # 如果是data:开头的网址,就要重试一次
else:
if element == None:
element = elements
@ -1156,6 +1148,14 @@ class BrowserThread(Thread):
self.history["handle"] = thisHandle
thisHistoryURL = self.browser.current_url
# 快速提取处理
# start = time.time()
try:
tree = html.fromstring(self.browser.page_source)
except Exception as e:
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
node["parameters"]["quickExtractable"] = False
# end = time.time()
# print("解析页面秒数:", end - start)
if node["parameters"]["quickExtractable"]:
self.browser.switch_to.default_content() # 切换到主页面
tree = html.fromstring(self.browser.page_source)
@ -1321,25 +1321,7 @@ class BrowserThread(Thread):
if self.BREAK:
self.BREAK = False
break
try:
changed_handle = self.browser.current_window_handle != thisHandle
except: # 如果网页被意外关闭了的情况下
self.browser.switch_to.window(
self.browser.window_handles[-1])
changed_handle = self.browser.window_handles[-1] != thisHandle
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
try:
while True: # 一直关闭窗口直到当前标签页
self.browser.close() # 关闭使用完的标签页
self.browser.switch_to.window(
self.browser.window_handles[-1])
if self.browser.current_window_handle == thisHandle:
break
except Exception as e:
self.print_and_log("关闭标签页发生错误:", e)
self.print_and_log(
"Error occurred while closing tab: ", e)
index, elements = self.handleHistory(node, xpath, thisHistoryURL, thisHistoryLength, index, elements=elements)
index, elements = self.handleHistory(node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, elements=elements)
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
@ -1381,25 +1363,7 @@ class BrowserThread(Thread):
if self.BREAK:
self.BREAK = False
break
try:
changed_handle = self.browser.current_window_handle != thisHandle
except: # 如果网页被意外关闭了的情况下
self.browser.switch_to.window(
self.browser.window_handles[-1])
changed_handle = self.browser.window_handles[-1] != thisHandle
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
try:
while True: # 一直关闭窗口直到当前标签页
self.browser.close() # 关闭使用完的标签页
self.browser.switch_to.window(
self.browser.window_handles[-1])
if self.browser.current_window_handle == thisHandle:
break
except Exception as e:
self.print_and_log("关闭标签页发生错误:", e)
self.print_and_log(
"Error occurred while closing tab: ", e)
index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
index, element = self.handleHistory(node, path, thisHandle, thisHistoryURL, thisHistoryLength, index, element=element)
except NoSuchElementException:
self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素:", path)
@ -1447,6 +1411,7 @@ class BrowserThread(Thread):
code = get_output_code(output)
if code <= 0:
break
index, _ = self.handleHistory(node, "", thisHandle, thisHistoryURL, thisHistoryLength, index)
elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
# tempList = node["parameters"]["textList"].split("\r\n")
urlList = list(
@ -1696,8 +1661,11 @@ class BrowserThread(Thread):
try:
actions = ActionChains(self.browser) # 实例化一个action对象
if newTab == 1: # 在新标签页打开
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
if sys.platform == "darwin": # Mac
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
else:
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
else:
actions.click(element).perform()
except Exception as e:
@ -1715,6 +1683,21 @@ class BrowserThread(Thread):
script = 'var result = document.evaluate(`' + path + \
'`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
self.browser.execute_script(script, str(index)) # 用js的点击方法
elif click_way == 2: # 双击
try:
actions = ActionChains(self.browser) # 实例化一个action对象
actions.double_click(element).perform()
except Exception as e:
self.browser.execute_script("arguments[0].scrollIntoView();", element)
try:
actions = ActionChains(self.browser) # 实例化一个action对象
actions.double_click(element).perform()
except Exception as e:
self.print_and_log(f"Selenium双击元素{path}失败将尝试使用JavaScript双击")
self.print_and_log(f"Failed to double click element {path} with Selenium, will try to double click with JavaScript")
script = 'var result = document.evaluate(`' + path + \
'`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
self.browser.execute_script(script, str(index)) # 用js的点击方法
self.recordLog("点击元素|Click element: " + path)
except TimeoutException:
self.print_and_log(
@ -1797,7 +1780,6 @@ class BrowserThread(Thread):
self.print_and_log("History Length Error")
self.history["index"] = 0
self.scrollDown(param) # 根据参数配置向下滚动
# rt.end()
def get_content(self, p, element):
content = ""
@ -1824,7 +1806,7 @@ class BrowserThread(Thread):
downloadPic = 0
if downloadPic == 1:
download_image(self, content, "Data/Task_" +
str(self.id) + "/" + self.saveName + "/", element)
str(self.id) + "/" + self.saveName + "/images", element)
else: # 普通节点
if p["splitLine"] == 1:
text = extract_text_from_html(element.get_attribute('outerHTML'))
@ -1853,7 +1835,7 @@ class BrowserThread(Thread):
downloadPic = 0
if downloadPic == 1:
download_image(self, content, "Data/Task_" +
str(self.id) + "/" + self.saveName + "/", element)
str(self.id) + "/" + self.saveName + "/images", element)
else:
command = 'var arr = [];\
var content = arguments[0];\
@ -1965,6 +1947,8 @@ class BrowserThread(Thread):
content = element.get_attribute(attribute_name)
except:
content = ""
elif p["contentType"] == 15: # 常量值
content = p["JS"]
if content == None:
content = ""
return content
@ -2208,7 +2192,9 @@ if __name__ == '__main__':
"server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入
"pause_key": "p", # 暂停键
"version": "0.6.0",
"version": "0.6.3",
"docker_driver": "",
"user_folder": "",
}
c = Config(config)
print(c)
@ -2283,7 +2269,9 @@ if __name__ == '__main__':
options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒
# 阻止http -> https的重定向
options.add_argument("--disable-features=CrossSiteDocumentBlockingIfIsolating,CrossSiteDocumentBlockingAlways,IsolateOrigins,site-per-process")
options.add_argument("--disable-web-security") # 禁用同源策略
options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors')
@ -2302,35 +2290,43 @@ if __name__ == '__main__':
os.mkdir(tmp_user_folder_parent)
characters = string.ascii_letters + string.digits
for i in range(len(c.ids)):
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options = tmp_options[i]["options"]
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
if c.user_folder == "":
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
print(f"Use local user data folder: {tmp_user_data_folder}")
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
else:
options.add_argument(
f'--user-data-dir={c.user_folder}')
print(f"Use specifed user data folder: {c.user_folder}", ", please note if you are using docker, this user folder path should be the path inside the docker container.")
print(f"使用指定的用户信息目录: {c.user_folder}", "请注意如果您正在使用docker此用户文件夹路径应是容器内的路径。")
print(
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally说明有之前运行的Chrome实例没有正常关闭请关闭之前打开的所有Chrome实例后再运行程序即可。")
print(
@ -2343,9 +2339,13 @@ if __name__ == '__main__':
print("id: ", id)
if c.read_type == "remote":
print("remote")
content = requests.get(
try:
content = requests.get(
c.server_address + "/queryExecutionInstance?id=" + str(id))
service = json.loads(content.text) # 加载服务信息
service = json.loads(content.text) # 加载服务信息
except:
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
print("无法连接到服务器请确保EasySpider主程序正在运行或者您可以将--read_type参数更改为'local'以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
else:
print("local")
local_folder = os.path.join(os.getcwd(), "execution_instances")
@ -2370,8 +2370,8 @@ if __name__ == '__main__':
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
print("Data path:", path)
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id), "files")
print("文件下载路径|File Download path:", path)
options.add_experimental_option("prefs", {
# 设置文件下载路径
"download.default_directory": path,
@ -2396,8 +2396,17 @@ if __name__ == '__main__':
except:
browser = "chrome"
if browser == "chrome":
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
if c.docker_driver == "":
print("Using local driver")
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
else:
print("Using remote driver")
# Use docker driver, default address is http://localhost:4444/wd/hub
# Headless mode
# options.add_argument("--headless")
# print("Headless mode")
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
elif browser == "edge":
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
@ -2458,6 +2467,7 @@ if __name__ == '__main__':
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入
try:
from pynput.keyboard import Key, Listener
if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event),
on_release=on_release_creator(event, press_time)) as listener:

View File

@ -1 +1,50 @@
#!/bin/bash
# Launcher for EasySpider on Linux.
#
# On Ubuntu 24.04 and newer the Chromium SUID sandbox helper must be owned by
# root with mode 4755. This script checks that and fixes it through sudo when
# needed. On every other system it only prints the manual commands. In both
# cases EasySpider is started at the end.

# Query the distribution name and release with lsb_release. Errors are
# silenced so that a missing lsb_release simply selects the "other system"
# branch below.
os_name=$(lsb_release -si 2>/dev/null)
os_version=$(lsb_release -sr 2>/dev/null)

# Extract the major version number (the part before the first dot).
major_version=$(echo "$os_version" | cut -d'.' -f1)

# An empty or non-numeric version would make the integer test below fail with
# "integer expression expected", so treat it as 0.
case "$major_version" in
    '' | *[!0-9]*) major_version=0 ;;
esac

# Check for Ubuntu with version >= 24.04. Ubuntu releases are numbered YY.04
# and YY.10, so "major >= 24" is equivalent to ">= 24.04". The two tests are
# chained with && only: in a shell list && and || have equal precedence, so
# the earlier "A && B || C" form matched any distribution whose major version
# was 24.
if [ "$os_name" = "Ubuntu" ] && [ "$major_version" -ge 24 ]; then
    # Path of the sandbox helper to check.
    file_path="./EasySpider/chrome-sandbox"

    # Abort if the helper does not exist.
    if [ ! -e "$file_path" ]; then
        echo "File Not Exist!"
        exit 1
    fi

    # Owner of the file.
    owner=$(stat -c %U "$file_path")

    # Permission bits of the file (octal).
    permissions=$(stat -c %a "$file_path")

    # Fix ownership and mode unless the helper is already root-owned with 4755.
    if [ "$owner" != "root" ] || [ "$permissions" != "4755" ]; then
        echo "这是你第一次在该Ubuntu系统上使用EasySpider请在下方输入密码来调整文件权限以使用EasySpider"
        echo "This is the first time you use EasySpider in this Ubuntu system, please change your permission of the software by input your password below (should have root/sudo permission):"
        sudo chown root:root "$file_path"
        sudo chmod 4755 "$file_path"
        sudo chown root:root "./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
        sudo chmod 4755 "./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
    fi
else
    echo "如果报错“The SUID sandbox helper binary was found, but is not configured correctly”请尝试执行以下命令后再次运行EasySpider"
    echo "If you encounter the error message “The SUID sandbox helper binary was found, but is not configured correctly”, please try run the following commands and run EasySpider again:"
    echo ""
    echo "sudo chown root:root ./EasySpider/chrome-sandbox"
    echo "sudo chmod 4755 ./EasySpider/chrome-sandbox"
    echo "sudo chown root:root ./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
    echo "sudo chmod 4755 ./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
    echo ""
    echo ""
fi

./EasySpider/EasySpider

View File

@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
"""
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
print(globals())
# 导包 | Import packages
from selenium.common.exceptions import ElementClickInterceptedException
@ -56,3 +56,20 @@ finally:
print("All parameters:", self.outputParameters)
print(test(3))
print("执行完毕|Execution completed")
import time
time.sleep(3)
def new_line(outputParameters, maxViewLength, record):
    """Print one row of output values to the console and return it as a list.

    Args:
        outputParameters: mapping whose values form the row, in iteration order.
        maxViewLength: maximum number of characters printed per value.
        record: sequence of flags, one per value; a value is printed only when
            its flag is truthy. Positions missing from a shorter ``record``
            are treated as "do not print" instead of raising IndexError.

    Returns:
        A list with every value of ``outputParameters`` (printed or not).
    """
    line = []
    print("Use this function to print a new line in the console")
    for i, value in enumerate(outputParameters.values()):
        line.append(value)
        if i < len(record) and record[i]:
            # str() keeps non-string values (numbers, None, ...) sliceable.
            print(str(value)[:maxViewLength], " ", end="")
    print("")
    return line
# Example call of new_line with a view length of 10 characters and a record list whose flags are all True.
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

View File

@ -1 +1 @@
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/d
iv[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

View File

@ -1 +1 @@
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}

File diff suppressed because one or more lines are too long

View File

@ -1,8 +1,29 @@
Due to the complex security settings of MacOS, the issue of being unable to open software due to the "unverified developer" message may occur upon the first attempt to open the software. Please refer to the following GitHub document to see how to open software and perform tasks on your MacOS version:
Due to MacOS's complex security settings, software downloaded for the first time will warn that the developer is unverified and will not allow the application to run. Please follow these steps to unlock:
https://github.com/NaiboWang/EasySpider/wiki/MacOS-Guide
1. Open the system Terminal.
The main steps are as follows:
2. Navigate to the EasySpider software directory, such as:
cd ~/Downloads/EasySpider_MacOS
3. In the EasySpider directory, run the `first_time_run.sh` script to modify the package properties by using the following command:
bash first_time_run.sh
This will unlock EasySpider for both design and execution stages.
If you encounter errors such as the one below during the command execution, they can be ignored, and you may proceed to open the software after the command completes:
xattr: [Errno 13] Permission denied: 'EasySpider.app/Contents/Resources/app/node_modules/node-window-manager/build/node_gyp_bins/python3'
Alternatively, this video shows how to open the software and run tasks on the MacOS version: https://www.bilibili.com/video/BV1E34y137fT/
- Design phase - Apple Arm chip version of MacOS

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

View File

@ -1 +1 @@
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/d
iv[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":309,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-24 00:34:50","update_time":"2023-12-24 00:36:58","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"text","exampleValue":"JS(\"return new Date().getYear()\")1","value":"JS(\"return new Date().getYear()\")1"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"
parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"value":"JS(\"return new Date().getYear()\")1","index":0,"allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}},{"id":3,"index":3,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[4],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., 
'手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":4,"index":4,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"ughtq41gxwnlqia7awp","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"ughtq41gxwnlqia7awp","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":311,"name":"重命名测试","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-28 14:05:20","update_time":"2023-12-28 14:05:43","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afte
rJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":315,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-29 22:34:23","update_time":"2023-12-29 22:38:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"Text","desc":"自定义操作返回的数据","type":"text","recordASField":1,"exampleValue":""},{"id":1,"name":"Link","desc":"自定义操作返回的数据","type":"text","recordASField":1,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[4,5,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWa
itTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":5,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":""}},{"id":3,"index":4,"parentId":2,"type":0,"option":5,"title":"Text","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":0,"codeMode":2,"code":"return arguments[0].innerText","waitTime":0,"recordASField":1,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}},{"id":4,"index":5,"parentId":2,"type":0,"option":5,"title":"Link","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":2,"code":"return arguments[0].href","waitTime":0,"recordASField":1,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

View File

@ -0,0 +1 @@
{"id":316,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-30 22:35:04","update_time":"2023-12-30 22:35:12","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义操作","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":5,"title":"自定义操作","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":12,"code":"","waitTime":0,"recordASField
":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

View File

@ -0,0 +1 @@
{"id":317,"name":"图片下载","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-01-05 22:14:43","update_time":"2024-01-05 22:15:19","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数2_图片地址","desc":"","type":"text","recordASField":1,"exampleValue":"//m.360buyimg.com/babel/jfs/t1/232616/15/5744/219106/656d810aF16705ea9/41c4997dc1b81f17.png"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"
clear":0,"newLine":1,"params":[{"nodeType":4,"contentType":0,"relative":false,"name":"参数1_图片地址","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-3]/div/div/a/img"],"exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"}],"unique_index":"i9in42ta6klr0pwp4k","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}},{"id":2,"index":3,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[4],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div[1]/a[1]/img[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/a[1]/img[1]","//img[contains(., 
'')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-4]/div/div/a/img"]}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":4,"contentType":0,"relative":true,"name":"参数2_图片地址","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/jfs/t1/232616/15/5744/219106/656d810aF16705ea9/41c4997dc1b81f17.png"}],"unique_index":"i81avec75qflr0pwym8","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":1,"splitLine":0}]}}]}

View File

@ -0,0 +1 @@
{"id":318,"name":"京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 05:08:03","update_time":"2024-04-22 05:19:48","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"电脑数码"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://prodev.jd.com/mall/active/31XPWPTonxJ9e5YoQ85HS7z8XNYQ/index.html?babelChannel=ttt40"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[4]/div[1]/div[4]/ul[1]/li/a[1]
","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[4]/div[1]/div[4]/ul[1]/li[1]/a[1]","//a[contains(., '电脑数码')]","//A[@class='navitems-lk']","/html/body/div[last()-5]/div[last()-2]/div/div[last()-1]/ul/li[last()-8]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":15,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"电脑数码"}],"unique_index":"auwkv5g1krqlva0tsc4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"123","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://prodev.jd.com/mall/active/31XPWPTonxJ9e5YoQ85HS7z8XNYQ/index.html?babelChannel=ttt40"}],"unique_index":"auwkv5g1krqlva0tsc4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

View File

@ -0,0 +1 @@
{"id":-2,"name":"百度一下,你就知道","url":"https://www.baidu.com?id=1","links":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","create_time":"2024-04-22 05:45:12","update_time":"2024-04-22 05:45:20","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com?id=1","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com?id=1","links":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}

View File

@ -0,0 +1 @@
{"id":320,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 05:53:18","update_time":"2024-04-22 05:53:28","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}

View File

@ -0,0 +1 @@
{"id":321,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 07:02:02","update_time":"2024-04-22 07:02:16","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}

View File

@ -0,0 +1 @@
{"id":322,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 08:13:15","update_time":"2024-04-22 08:13:33","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code
":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}

View File

@ -0,0 +1 @@
{"id":323,"name":"新web采集任务","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"","update_time":"2024-08-10 17:29:04","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":325,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-12-30 22:37:29","update_time":"2024-12-30 22:37:43","version":"0.6.3","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"0暖心2024 总书记的贴心话"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":f
alse,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li[1]/a[1]","//a[contains(., '0暖心2024 总')]","//a[@class='title-content c-link c-font-medium c-line-clamp1']","/html/body/div[last()-4]/div[last()-3]/div[last()-3]/div/div/div/ul/li[last()-9]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"0暖心2024 
总书记的贴心话"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

View File

@ -1 +1 @@
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,5 @@
#!/bin/bash
# Strip extended attributes (recursively) from the design-stage app bundle and
# the two execute-stage entries. Paths are relative to the current directory,
# so run this from inside the EasySpider folder.
for target in EasySpider.app easyspider_executestage easyspider_executestage_full; do
    xattr -cr "$target"
done

View File

@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
"""
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
print(globals())
# 导包 | Import packages
from selenium.common.exceptions import ElementClickInterceptedException
@ -56,3 +56,20 @@ finally:
print("All parameters:", self.outputParameters)
print(test(3))
print("执行完毕|Execution completed")
import time
time.sleep(3)
# Collects every value of outputParameters into a list, echoes the values
# selected by record to the console on one line, and returns the list.
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the loop body (in particular whether `i += 1` runs for every value or only
# for printed ones) cannot be confirmed here — check the original file.
def new_line(outputParameters, maxViewLength, record):
# Values gathered in the iteration order of outputParameters.values().
line = []
print("Use this function to print a new line in the console")
# Counter used to index into record.
i = 0
for value in outputParameters.values():
line.append(value)
# A truthy record[i] selects this value for display.
if record[i]:
# Shows at most the first maxViewLength items of value (presumably characters
# of a string — confirm); end="" keeps the output on the same console line.
print(value[:maxViewLength], " ", end="")
i += 1
# Emits the newline that ends the console line.
print("")
return line
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])

View File

@ -12,6 +12,11 @@ Official documentation can be found at: https://github.com/NaiboWang/EasySpider/
Video Tutorial: https://youtube.com/playlist?list=PL0kEFEkWrT7mt9MUlEBV2DTo1QsaanUTp
You can import tasks from other machines by simply opening the EasySpider software in this directory, right-clicking "Show Package Contents", and then placing the .json files from the tasks folder in the /Users/your user name/Library/Application Support/EasySpider/tasks folder of the other machine. Similarly, execution ID files can be imported by copying the .json files from the execution_instances folder. Please note that the .json files in both folders only support names greater than 0.
You can import tasks from other machines by simply opening the EasySpider software in this directory, right-clicking "Show Package Contents", and then placing the .json files from the tasks folder in the /Users/Your User Name/Library/Application Support/EasySpider/tasks folder of the other machine. Similarly, execution ID files can be imported by copying the .json files from the execution_instances folder. Please note that the .json files in both folders only support file names that are numbers greater than 0.
You can quickly navigate to the tasks folder using the following commands:
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
open .
If you need to press p on the keyboard to pause and continue the execution of the task, you need to grant the program keyboard monitoring permission.

View File

@ -1,6 +1,26 @@
由于MacOS复杂的安全性设置初次打开软件会显示未验证开发者从而不允许打开的问题参考以下视频来查看MacOS版本如何打开软件和执行任务https://www.bilibili.com/video/BV1E34y137fT/
由于MacOS复杂的安全性设置，初次打开软件会显示未验证开发者从而不允许打开的问题，通过以下方式来解锁:
主要步骤如下:
1. 打开系统terminal命令行窗口。
2. 切换到EasySpider软件目录
cd ~/Downloads/EasySpider_MacOS
3. 在EasySpider目录下使用以下命令运行目录下的`first_time_run.sh`脚本修改软件包属性:
bash first_time_run.sh
即可一键解锁并正常使用EasySpider包括设计阶段程序和执行阶段程序。
执行命令时如果出现类似下面的错误可以忽略,执行完成之后即可打开软件:
xattr: [Errno 13] Permission denied: 'EasySpider.app/Contents/Resources/app/node_modules/node-window-manager/build/node_gyp_bins/python3'
以下是另一种方案：请参考以下视频来查看MacOS版本如何打开软件和执行任务：https://www.bilibili.com/video/BV1E34y137fT/
- 设计阶段 - Apple Arm芯片版MacOS

View File

@ -14,4 +14,9 @@
可以从其他机器导入任务只需要把其他机器的tasks文件夹里的.json文件放入/Users/你的用户名/Library/Application Support/EasySpider/tasks文件夹里即可。同理执行号文件可以通过复制execution_instances文件夹中的.json文件来导入。注意两个文件夹里的.json文件只支持命名为大于0的数字。
可通过以下命令快速进入tasks文件夹
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
open .
如果需要按p键暂停和继续任务的执行,需要赋予程序键盘监控权限。

View File

@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
"""
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
print(globals())
# 导包 | Import packages
from selenium.common.exceptions import ElementClickInterceptedException
@ -56,3 +56,20 @@ finally:
print("All parameters:", self.outputParameters)
print(test(3))
print("执行完毕|Execution completed")
import time
time.sleep(3)
# Collects every value of outputParameters into a list, echoes the values
# selected by record to the console on one line, and returns the list.
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the loop body (in particular whether `i += 1` runs for every value or only
# for printed ones) cannot be confirmed here — check the original file.
def new_line(outputParameters, maxViewLength, record):
# Values gathered in the iteration order of outputParameters.values().
line = []
print("Use this function to print a new line in the console")
# Counter used to index into record.
i = 0
for value in outputParameters.values():
line.append(value)
# A truthy record[i] selects this value for display.
if record[i]:
# Shows at most the first maxViewLength items of value (presumably characters
# of a string — confirm); end="" keeps the output on the same console line.
print(value[:maxViewLength], " ", end="")
i += 1
# Emits the newline that ends the console line.
print("")
return line
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

View File

@ -1 +1 @@
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数0代表无限循环","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/d
iv[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}

View File

@ -1 +1 @@
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}

File diff suppressed because one or more lines are too long

View File

@ -9,6 +9,7 @@ import threading
# import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from constants import WriteMode, DataWriteMode, GraphOption
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
@ -31,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from pynput.keyboard import Key, Listener
from datetime import datetime
import io # 遇到错误退出时应执行的代码
import json
@ -76,10 +76,7 @@ class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
Thread.__init__(self)
self.logs = io.StringIO()
try:
self.log = bool(service["recordLog"])
except:
self.log = True
self.log = bool(service.get("recordLog", True))
self.browser = browser_t
self.option = option
self.config = config
@ -87,22 +84,13 @@ class BrowserThread(Thread):
self.totalSteps = 0
self.id = id
self.event = event
try:
self.saveName = service["saveName"] # 保存文件的名字
except:
now = datetime.now()
# 将时间格式化为精确到秒的字符串
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
now = datetime.now()
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
self.OUTPUT = ""
self.SAVED = False
self.BREAK = False
self.CONTINUE = False
try:
maximizeWindow = service["maximizeWindow"]
except:
maximizeWindow = 0
if maximizeWindow == 1:
self.browser.maximize_window()
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
# 名称设定
if saveName != "": # 命令行覆盖保存名称
self.saveName = saveName # 保存文件的名字
@ -123,13 +111,13 @@ class BrowserThread(Thread):
self.getDataStep = 0
self.startSteps = 0
try:
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
if startFromExit == 1:
if service.get("startFromExit", 0) == 1:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
encoding='utf-8-sig') as file_obj:
self.startSteps = int(file_obj.read()) # 读取已执行步数
except:
pass
except Exception as e:
self.print_and_log(f"读取steps.txt失败原因{str(e)}")
if self.startSteps != 0:
self.print_and_log("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
self.startSteps, "条。")
@ -137,7 +125,7 @@ class BrowserThread(Thread):
"will start from the last step, before we already collected", self.startSteps, " items.")
else:
self.print_and_log("此模式下任务ID", self.id,
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
self.print_and_log("In this mode, task ID", self.id,
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
stealth_path = driver_path[:driver_path.find(
@ -145,13 +133,12 @@ class BrowserThread(Thread):
with open(stealth_path, 'r') as f:
js = f.read()
self.print_and_log("Loading stealth.min.js")
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
WebDriverWait(self.browser, 10)
@ -164,75 +151,65 @@ class BrowserThread(Thread):
self.monitor_thread.start()
# self.browser.get('about:blank')
self.procedure = service["graph"] # 程序执行流程
try:
self.maxViewLength = service["maxViewLength"] # 最大显示长度
except:
self.maxViewLength = 15
try:
self.outputFormat = service["outputFormat"] # 输出格式
except:
self.outputFormat = "csv"
try:
self.task_version = service["version"] # 任务版本
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
if service["version"] != version:
self.print_and_log("版本不一致,请使用" +
service["version"] + "版本的EasySpider运行该任务")
self.print_and_log("Version not match, please use EasySpider " +
service["version"] + " to run this task!")
self.browser.quit()
sys.exit()
except: # 0.2.0版本没有version字段所以直接退出
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式1为追加2为覆盖3为重命名文件
self.task_version = service.get("version", "") # 任务版本
if not self.task_version:
self.print_and_log("版本不一致请使用v0.2.0版本的EasySpider运行该任务")
self.print_and_log(
"Version not match, please use EasySpider v0.2.0 to run this task!")
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
self.browser.quit()
sys.exit()
try:
self.save_threshold = service["saveThreshold"] # 保存最低阈值
except:
self.save_threshold = 10
try:
self.links = list(
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
except:
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务")
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
self.browser.quit()
sys.exit()
service_links = service.get("links")
if service_links:
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
else:
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
self.OUTPUT = [] # 采集的数据
try:
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式1为追加2为覆盖3为重命名文件
except:
self.dataWriteMode = 1
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
if self.dataWriteMode == 2:
if self.dataWriteMode == DataWriteMode.Cover.value:
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
elif self.dataWriteMode == 3:
elif self.dataWriteMode == DataWriteMode.Rename.value:
i = 2
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
i = i + 1
self.saveName = self.saveName + '_' + str(i)
self.print_and_log("文件已存在,已重命名为", self.saveName)
self.writeMode = 1 # 写入模式0为新建1为追加
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
self.writeMode = WriteMode.Create.value # 写入模式0为新建1为追加
if self.outputFormat in ['csv', 'txt', 'xlsx']:
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
self.OUTPUT.append([]) # 添加表头
self.writeMode = 0
self.writeMode = WriteMode.Create.value
elif self.outputFormat == "json":
self.writeMode = 3 # JSON模式无需判断是否存在文件
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
elif self.outputFormat == "mysql":
self.mysql = myMySQL(config["mysql_config_path"])
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
self.writeMode = 2
if self.writeMode == 0:
self.mysql.create_table(self.saveName, service["outputParameters"],
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
self.writeMode = WriteMode.MySQL.value # MySQL模式
if self.writeMode == WriteMode.Create.value:
self.print_and_log("新建模式|Create Mode")
elif self.writeMode == 1:
elif self.writeMode == WriteMode.Append.value:
self.print_and_log("追加模式|Append Mode")
elif self.writeMode == 2:
elif self.writeMode == WriteMode.MySQL.value:
self.print_and_log("MySQL模式|MySQL Mode")
elif self.writeMode == 3:
elif self.writeMode == WriteMode.Json.value:
self.print_and_log("JSON模式|JSON Mode")
self.containJudge = service["containJudge"] # 是否含有判断语句
self.outputParameters = {}
self.service = service
@ -245,191 +222,140 @@ class BrowserThread(Thread):
if param["name"] not in self.outputParameters.keys():
self.outputParameters[param["name"]] = ""
self.dataNotFoundKeys[param["name"]] = False
try:
self.outputParametersTypes.append(param["type"])
except:
self.outputParametersTypes.append("text")
try:
self.outputParametersRecord.append(
bool(param["recordASField"]))
except:
self.outputParametersRecord.append(True)
self.outputParametersTypes.append(param.get("type", "text"))
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
# 文件叠加的时候不添加表头
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if self.writeMode == 0:
self.OUTPUT[0].append(param["name"])
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
self.OUTPUT[0].append(param["name"])
self.urlId = 0 # 全局记录变量
self.preprocess() # 预处理,优化提取数据流程
try:
self.inputExcel = service["inputExcel"] # 输入Excel
except:
self.inputExcel = ""
self.inputExcel = service.get("inputExcel", "") # 输入Excel
self.readFromExcel() # 读取Excel获得参数值
# 检测如果没有复杂的操作,优化提取数据流程
def preprocess(self):
for node in self.procedure:
try:
iframe = node["parameters"]["iframe"]
except:
node["parameters"]["iframe"] = False
for index_node, node in enumerate(self.procedure):
parameters: dict = node["parameters"]
iframe = parameters.get('iframe')
option = node["option"]
try:
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
node["parameters"]["xpath"])
except:
pass
try:
node["parameters"]["waitElementIframeIndex"] = int(
node["parameters"]["waitElementIframeIndex"])
except:
node["parameters"]["waitElement"] = ""
node["parameters"]["waitElementTime"] = 10
node["parameters"]["waitElementIframeIndex"] = 0
if node["option"] == 1: # 打开网页操作
try:
cookies = node["parameters"]["cookies"]
except:
node["parameters"]["cookies"] = ""
elif node["option"] == 2: # 点击操作
try:
alertHandleType = node["parameters"]["alertHandleType"]
except:
node["parameters"]["alertHandleType"] = 0
if node["parameters"]["useLoop"]:
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
if parameters.get("xpath"):
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
if parameters.get("waitElementIframeIndex"):
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
else:
parameters["waitElement"] = ""
parameters["waitElementTime"] = 10
parameters["waitElementIframeIndex"] = 0
if option == GraphOption.Get.value: # 打开网页操作
parameters["cookies"] = parameters.get("cookies", "")
elif option == GraphOption.Click.value: # 点击操作
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
if parameters.get("useLoop"):
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 3: # 提取数据操作
node["parameters"]["recordASField"] = 0
try:
params = node["parameters"]["params"]
except:
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
params = node["parameters"]["params"]
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Extract.value: # 提取数据操作
parameters["recordASField"] = 0
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
parameters["clear"] = parameters.get("clear", 0)
parameters["newLine"] = parameters.get("newLine", 1)
params = parameters["params"]
for param in params:
try:
iframe = param["iframe"]
except:
param["iframe"] = False
try:
param["iframe"] = param.get("iframe", False)
if param.get("relativeXPath"):
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
except:
pass
try:
node["parameters"]["recordASField"] = param["recordASField"]
except:
node["parameters"]["recordASField"] = 1
try:
splitLine = int(param["splitLine"])
except:
param["splitLine"] = 0
if param["contentType"] == 8:
self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log(
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
parameters["recordASField"] = param.get("recordASField", 1)
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
if param.get("contentType") == 8:
self.print_and_log("默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType =="
"8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片"
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
"的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
"modify the source code get_content function -> contentType == 8 position "
"to your own OCR model and then compile and run it; or you can first set "
"the content type of the crawler to \"Element Screenshot\" to save the "
"picture, and then call your own program with custom operations. The "
"function of the program is to read the latest generated picture, then use "
"a good model, such as PaddleOCR to recognize the picture, and then return "
"the return value as a parameter output to the program.")
param["optimizable"] = detect_optimizable(param)
elif node["option"] == 4: # 输入文字
try:
index = node["parameters"]["index"] # 索引值
except:
node["parameters"]["index"] = 0
elif node["option"] == 5: # 自定义操作
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
elif node["option"] == 7: # 移动到元素
if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 8: # 循环操作
try:
exitElement = node["parameters"]["exitElement"]
if exitElement == "":
node["parameters"]["exitElement"] = "//body"
except:
node["parameters"]["exitElement"] = "//body"
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
try:
skipCount = node["parameters"]["skipCount"]
except:
node["parameters"]["skipCount"] = 0
elif option == GraphOption.Input.value: # 输入文字
parameters['index'] = parameters.get('index', 0)
elif option == GraphOption.Custom.value: # 自定义操作
parameters['clear'] = parameters.get('clear', 0)
parameters['newLine'] = parameters.get('newLine', 1)
elif option == GraphOption.Move.value: # 移动到元素
if parameters.get('useLoop'):
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Loop.value: # 循环操作
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
parameters["quickExtractable"] = False # 是否可以快速提取
parameters['skipCount'] = parameters.get('skipCount', 0)
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
try:
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
except:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
try:
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
except:
waitElement = ""
if node["parameters"]["iframe"]:
node["parameters"]["quickExtractable"] = False # 如果是iframe那么不可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
if not params:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
if parameters["iframe"]:
parameters["quickExtractable"] = False # 如果是iframe那么不可以快速提取
else:
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
if node["parameters"]["skipCount"] > 0:
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
parameters["quickExtractable"] = True # 先假设可以快速提取
if parameters["skipCount"] > 0:
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
for param in params:
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
try:
iframe = param["iframe"]
except:
param["iframe"] = False
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
param['iframe'] = param.get('iframe', False)
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
optimizable = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
node["parameters"]["quickExtractable"] = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
parameters["quickExtractable"] = False
break
if node["parameters"]["quickExtractable"]:
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
try:
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
if parameters["quickExtractable"]:
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
node["parameters"]["quickParams"] = []
for param in params:
content_type = ""
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
"::text()") >= 0:
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
or param["relativeXPath"].find("::text()") >= 0:
content_type = ""
elif param["nodeType"] == 2:
content_type = "//@href"
elif param["nodeType"] == 4: # 图片链接
elif param["nodeType"] == 4: # 图片链接
content_type = "//@src"
elif param["contentType"] == 1:
content_type = "/text()"
elif param["contentType"] == 0:
content_type = "//text()"
if param["relative"]: # 如果是相对XPath
if param["relative"]: # 如果是相对XPath
xpath = "." + param["relativeXPath"] + content_type
else:
xpath = param["relativeXPath"] + content_type
@ -443,6 +369,7 @@ class BrowserThread(Thread):
"nodeType": param["nodeType"],
"default": param["default"],
})
self.procedure[index_node]["parameters"] = parameters
self.print_and_log("预处理完成|Preprocess completed")
def readFromExcel(self):
@ -559,7 +486,10 @@ class BrowserThread(Thread):
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
time.sleep(quitWaitTime)
self.browser.quit()
try:
self.browser.quit()
except:
pass
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
try:
shutil.rmtree(self.option["tmp_user_data_folder"])
@ -775,18 +705,20 @@ class BrowserThread(Thread):
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 2:
self.recordLog("Execute JavaScript for element:" + code)
self.recordLog("对元素执行JavaScript:" + code)
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code, element)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 5:
try:
code = readCode(code)
@ -796,9 +728,9 @@ class BrowserThread(Thread):
self.recordLog("执行下面的代码:" + code)
self.recordLog("Execute the following code:" + code)
except Exception as e:
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" +
code, ", error is:", e)
code, ", error is:", str(e))
elif int(codeMode) == 6:
try:
code = readCode(code)
@ -1216,6 +1148,14 @@ class BrowserThread(Thread):
self.history["handle"] = thisHandle
thisHistoryURL = self.browser.current_url
# 快速提取处理
# start = time.time()
try:
tree = html.fromstring(self.browser.page_source)
except Exception as e:
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
node["parameters"]["quickExtractable"] = False
# end = time.time()
# print("解析页面秒数:", end - start)
if node["parameters"]["quickExtractable"]:
self.browser.switch_to.default_content() # 切换到主页面
tree = html.fromstring(self.browser.page_source)
@ -1721,8 +1661,11 @@ class BrowserThread(Thread):
try:
actions = ActionChains(self.browser) # 实例化一个action对象
if newTab == 1: # 在新标签页打开
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
if sys.platform == "darwin": # Mac
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
else:
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
else:
actions.click(element).perform()
except Exception as e:
@ -2249,7 +2192,9 @@ if __name__ == '__main__':
"server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入
"pause_key": "p", # 暂停键
"version": "0.6.2",
"version": "0.6.3",
"docker_driver": "",
"user_folder": "",
}
c = Config(config)
print(c)
@ -2345,35 +2290,43 @@ if __name__ == '__main__':
os.mkdir(tmp_user_folder_parent)
characters = string.ascii_letters + string.digits
for i in range(len(c.ids)):
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options = tmp_options[i]["options"]
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
if c.user_folder == "":
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
print(f"Use local user data folder: {tmp_user_data_folder}")
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
else:
options.add_argument(
f'--user-data-dir={c.user_folder}')
print(f"Use specifed user data folder: {c.user_folder}, please note if you are using docker, this user folder path should be the path inside the docker container.")
print(f"使用指定的用户信息目录: {c.user_folder}请注意如果您正在使用docker此用户文件夹路径应是容器内的路径。")
print(
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally说明有之前运行的Chrome实例没有正常关闭请关闭之前打开的所有Chrome实例后再运行程序即可。")
print(
@ -2386,9 +2339,13 @@ if __name__ == '__main__':
print("id: ", id)
if c.read_type == "remote":
print("remote")
content = requests.get(
try:
content = requests.get(
c.server_address + "/queryExecutionInstance?id=" + str(id))
service = json.loads(content.text) # 加载服务信息
service = json.loads(content.text) # 加载服务信息
except:
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
print("无法连接到服务器请确保EasySpider主程序正在运行或者您可以将--read_type参数更改为'local'以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
else:
print("local")
local_folder = os.path.join(os.getcwd(), "execution_instances")
@ -2439,8 +2396,17 @@ if __name__ == '__main__':
except:
browser = "chrome"
if browser == "chrome":
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
if c.docker_driver == "":
print("Using local driver")
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
else:
print("Using remote driver")
# Use docker driver, default address is http://localhost:4444/wd/hub
# Headless mode
# options.add_argument("--headless")
# print("Headless mode")
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
elif browser == "edge":
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
@ -2501,6 +2467,7 @@ if __name__ == '__main__':
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入
try:
from pynput.keyboard import Key, Listener
if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event),
on_release=on_release_creator(event, press_time)) as listener:

View File

@ -19,11 +19,16 @@ desired_capabilities["pageLoadStrategy"] = "none"
class MyChrome(webdriver.Chrome):
class MyChrome(webdriver.Chrome, webdriver.Remote):
def __init__(self, *args, **kwargs):
def __init__(self, mode='local_driver', *args, **kwargs):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
self.mode = mode
if mode == "local_driver":
webdriver.Chrome.__init__(self, *args, **kwargs)
elif mode == "remote_driver":
webdriver.Remote.__init__(self, *args, **kwargs)
# super().__init__(*args, **kwargs) # 调用父类的 __init__
# def find_element(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为

View File

@ -64,49 +64,49 @@ def compress_folder_to_7z_split(folder_path, output_file):
except:
subprocess.call(["7zz", "a", "-v95m", output_file, folder_path])
easyspider_version = "0.6.2"
easyspider_version = "0.6.3"
if __name__ == "__main__":
if sys.platform == "win32" and platform.architecture()[0] == "64bit":
file_name = f"EasySpider_{easyspider_version}_windows_x64.7z"
if os.path.exists("./EasySpider_windows_x64/user_data"):
shutil.rmtree("./EasySpider_windows_x64/user_data")
if os.path.exists("./EasySpider_windows_x64/Data"):
shutil.rmtree("./EasySpider_windows_x64/Data")
if os.path.exists("./EasySpider_windows_x64/execution_instances"):
shutil.rmtree("./EasySpider_windows_x64/execution_instances")
if os.path.exists("./EasySpider_windows_x64/config.json"):
os.remove("./EasySpider_windows_x64/config.json")
if os.path.exists("./EasySpider_windows_x64/mysql_config.json"):
os.remove("./EasySpider_windows_x64/mysql_config.json")
if os.path.exists("./EasySpider_windows_x64/TempUserDataFolder"):
shutil.rmtree("./EasySpider_windows_x64/TempUserDataFolder")
os.mkdir("./EasySpider_windows_x64/Data")
os.mkdir("./EasySpider_windows_x64/execution_instances")
# compress_folder_to_7z_split("./EasySpider_windows_x64", file_name)
file_name = f"EasySpider_{easyspider_version}_Windows_x64.7z"
if os.path.exists("./EasySpider_Windows_x64/user_data"):
shutil.rmtree("./EasySpider_Windows_x64/user_data")
if os.path.exists("./EasySpider_Windows_x64/Data"):
shutil.rmtree("./EasySpider_Windows_x64/Data")
if os.path.exists("./EasySpider_Windows_x64/execution_instances"):
shutil.rmtree("./EasySpider_Windows_x64/execution_instances")
if os.path.exists("./EasySpider_Windows_x64/config.json"):
os.remove("./EasySpider_Windows_x64/config.json")
if os.path.exists("./EasySpider_Windows_x64/mysql_config.json"):
os.remove("./EasySpider_Windows_x64/mysql_config.json")
if os.path.exists("./EasySpider_Windows_x64/TempUserDataFolder"):
shutil.rmtree("./EasySpider_Windows_x64/TempUserDataFolder")
os.mkdir("./EasySpider_Windows_x64/Data")
os.mkdir("./EasySpider_Windows_x64/execution_instances")
# compress_folder_to_7z_split("./EasySpider_Windows_x64", file_name)
# print(f"Compress {file_name} Split successfully!")
compress_folder_to_7z("./EasySpider_windows_x64", file_name)
compress_folder_to_7z("./EasySpider_Windows_x64", file_name)
print(f"Compress {file_name} successfully!")
elif sys.platform == "win32" and platform.architecture()[0] == "32bit":
file_name = f"EasySpider_{easyspider_version}_windows_x32.7z"
if os.path.exists("./EasySpider_windows_x32/user_data"):
shutil.rmtree("./EasySpider_windows_x32/user_data")
if os.path.exists("./EasySpider_windows_x32/Data"):
shutil.rmtree("./EasySpider_windows_x32/Data")
if os.path.exists("./EasySpider_windows_x32/execution_instances"):
shutil.rmtree("./EasySpider_windows_x32/execution_instances")
if os.path.exists("./EasySpider_windows_x32/config.json"):
os.remove("./EasySpider_windows_x32/config.json")
if os.path.exists("./EasySpider_windows_x32/mysql_config.json"):
os.remove("./EasySpider_windows_x32/mysql_config.json")
if os.path.exists("./EasySpider_windows_x32/TempUserDataFolder"):
shutil.rmtree("./EasySpider_windows_x32/TempUserDataFolder")
os.mkdir("./EasySpider_windows_x32/Data")
os.mkdir("./EasySpider_windows_x32/execution_instances")
# compress_folder_to_7z_split("./EasySpider_windows_x32", file_name)
file_name = f"EasySpider_{easyspider_version}_Windows_x32.7z"
if os.path.exists("./EasySpider_Windows_x32/user_data"):
shutil.rmtree("./EasySpider_Windows_x32/user_data")
if os.path.exists("./EasySpider_Windows_x32/Data"):
shutil.rmtree("./EasySpider_Windows_x32/Data")
if os.path.exists("./EasySpider_Windows_x32/execution_instances"):
shutil.rmtree("./EasySpider_Windows_x32/execution_instances")
if os.path.exists("./EasySpider_Windows_x32/config.json"):
os.remove("./EasySpider_Windows_x32/config.json")
if os.path.exists("./EasySpider_Windows_x32/mysql_config.json"):
os.remove("./EasySpider_Windows_x32/mysql_config.json")
if os.path.exists("./EasySpider_Windows_x32/TempUserDataFolder"):
shutil.rmtree("./EasySpider_Windows_x32/TempUserDataFolder")
os.mkdir("./EasySpider_Windows_x32/Data")
os.mkdir("./EasySpider_Windows_x32/execution_instances")
# compress_folder_to_7z_split("./EasySpider_Windows_x32", file_name)
# print(f"Compress {file_name} Split successfully!")
compress_folder_to_7z("./EasySpider_windows_x32", file_name)
compress_folder_to_7z("./EasySpider_Windows_x32", file_name)
print(f"Compress {file_name} successfully!")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
file_name = f"EasySpider_{easyspider_version}_Linux_x64.tar.xz"

Binary file not shown.

Binary file not shown.

View File

@ -1,4 +1,8 @@
# 环境编译说明|Environment Compilation Instruction
## 视频教程
[从源代码编译程序并设计运行和调试任务指南基于Ubuntu24.04](https://www.bilibili.com/video/BV1VE421P7yj/)
# 环境编译说明 | Environment Compilation Instruction
EasySpider分三部分
@ -19,35 +23,35 @@ EasySpider is divided into three parts:
This section covers the compilation instructions for the `main program`.
## 建议编译顺序|Suggested Compilation Order
## 建议编译顺序 | Suggested Compilation Order
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
3. 编译执行阶段程序,否则无法执行程序,只能设计程序
3. 编译执行阶段程序,否则无法执行任务,只能设计任务
-----
1. Compile the browser extension; otherwise, when the main program runs, it will report an error that `EasySpider_en.crx` cannot be found.
2. Compile the main program. At this point the main program can run normally, but it can only design tasks, not execute them.
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
3. Compile the execution stage program; otherwise tasks cannot be executed, only designed.
## 注意事项|Note
## 注意事项 | Note
请记住每当EasySpider扩展程序和执行程序更新时都要更新`EasySpider.crx``easyspider_executestage`文件。
Remember to update the `EasySpider.crx` and `easyspider_executestage` files whenever the EasySpider extension and execution program are updated.
## 环境构建|Environment Setup
## 环境构建 | Environment Setup
以下以Windows x64版本为例。
The following uses the Windows x64 version as an example.
### 浏览器和驱动|Browser and Driver
### 浏览器和驱动 | Browser and Driver
实在搞不定本节的情况下下载一个直接能用的EasySpider并把文件夹内的`EasySpider\resources\app\chrome_win64`文件夹拷贝到此`ElectronJS`文件夹下即可。
实在搞不定本节的情况下下载一个直接能用的EasySpider并把文件夹内的`EasySpider\resources\app\chrome_win64`文件夹拷贝到此`ElectronJS`文件夹下,并把`chrome_win64`文件夹下的`execute.sh`在原文件夹下复制一份并命名为`execute_win64.sh`即可。
If you're unable to handle the tasks in this section, you can download a ready-to-use EasySpider. Simply copy the `EasySpider\resources\app\chrome_win64` folder from the downloaded files and paste it into the ElectronJS folder.
If you're unable to handle the tasks in this section, you can download a ready-to-use EasySpider, and copy the `EasySpider\resources\app\chrome_win64` folder to this `ElectronJS` folder, then copy the `execute.sh` script found in the `chrome_win64` folder and rename it as `execute_win64.sh` in the same location.
------
@ -66,7 +70,7 @@ chrome_linux64/ # for linux x64
chrome_mac64/ # for mac x64
```
然后,从下面的页面下载和**自己安装的Chrome版本一致**的Chromedriver[https://chromedriver.chromium.org/downloads](https://chromedriver.chromium.org/downloads)把chromedriver放入刚刚的`chrome`文件夹内,并更名为下面的格式:
然后,从下面的页面下载和**自己安装的Chrome版本一致**的Chromedriver[https://googlechromelabs.github.io/chrome-for-testing/](https://googlechromelabs.github.io/chrome-for-testing/)把chromedriver放入刚刚的`chrome`文件夹内,并更名为下面的格式:
```
chromedriver_win32.exe # for windows x32
@ -77,7 +81,7 @@ chromedriver_mac64 # for mac x64
例如如果您想在Windows x64平台上构建此软件那么您首先需要下载适用于Windows x64的Chrome浏览器并将整个`chrome`文件夹复制到`ElectronJS`文件夹中,然后将文件夹重命名为`chrome_win64`。假设您下载的Chrome版本是110。接下来下载一个适用于Windows x64的110版本的ChromeDriver并将其放入`chrome_win64`文件夹中,然后将其重命名为`chromedriver_win64.exe`
最后,把此文件夹内的`stealth.min.js``execute.bat`文件拷贝入`chrome`文件夹内。
最后,把此`ElectronJS`文件夹内的`stealth.min.js``execute_win64.bat`文件拷贝入`chrome_win64`文件夹内**这一步不要忘**
Download Chrome from https://www.google.com/chrome/ and put it into this folder, using the following naming format:
@ -100,33 +104,31 @@ chromedriver_mac64 # for mac x64
For example, if you want to build this software on Windows x64 platform, then you should first download a Chrome for Windows x64, then copy the whole `chrome` folder to this `ElectronJS` folder and rename the folder to `chrome_win64`, assume the Chrome version you downloaded is 110; then, download a `chromedriver.exe` with version 110 for Windows x64, and put it into the `chrome_win64` folder, then rename it to `chromedriver_win64.exe`.
Finally, copy the `stealth.min.js` and `execute.bat` (for Windows x64) file in this folder to these `chrome` folders.
Finally, copy the `stealth.min.js` and `execute_win64.bat` files from this `ElectronJS` folder into the `chrome_win64` folder **(do not forget this step)**.
### NodeJS环境|NodeJS Environment
### NodeJS环境 | NodeJS Environment
1. Windows环境下需要先安装`VS Build Tools 2017` [https://aka.ms/vs/15/release/vs_buildtools.exe](https://aka.ms/vs/15/release/vs_buildtools.exe))的`Visual C++ Build Tools`组件,不然下面的命令无法执行,其他系统不需要。
1. Windows环境下需要先下载`VS Build Tools 2017` [https://aka.ms/vs/15/release/vs_buildtools.exe](https://aka.ms/vs/15/release/vs_buildtools.exe)并勾选安装其中`Visual C++ Build ToolsVisual C++生成工具)`组件以便`node-gyp`模块来安装`node-windows-manager`,不然下面的命令无法执行,其他系统不需要。同时,`Python3`也需要安装在系统中并配置好环境变量。
2. 安装`NodeJS`[https://nodejs.org/zh-cn/download/](https://nodejs.org/zh-cn/download/)。
3. 运行下面的命令来安装依赖:
```
npm install
npm install @electron-forge/cli -g
```
如果上面的命令运行速度很慢可以参考NodeJS换源说明[https://blog.csdn.net/qq_23211463/article/details/123769061](https://blog.csdn.net/qq_23211463/article/details/123769061)。
如果上面的命令运行速度很慢可以参考使用NodeJS和Electron包的换源说明来加速安装[https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803)。
-----
1. On Windows, you need to install `VS Build Tools 2017` (https://aka.ms/vs/15/release/vs_buildtools.exe, select and install the `Visual C++ Build Tools` component) first for node-gyp to install `node-windows-manager` (No need for other OS).
1. On Windows, you first need to download `VS Build Tools 2017` (https://aka.ms/vs/15/release/vs_buildtools.exe) and select and install its `Visual C++ Build Tools` component, so that the `node-gyp` module can install `node-windows-manager` (not needed on other operating systems). `Python3` must also be installed, with its environment variables configured.
2. Install `NodeJS`: [https://nodejs.org/en/download/](https://nodejs.org/en/download/).
3. Run the following commands to install NodeJS packages:
```
npm install
npm install @electron-forge/cli -g
```
## 运行说明|Run Instruction
## 运行说明 | Run Instruction
在当前文件夹执行以下命令即可在开发模式下运行程序:
@ -146,14 +148,13 @@ npm run start_direct
However, at this point you can only design tasks, not execute them. To execute tasks, you also need to compile the task execution program by following the compilation instructions in the `ExecuteStage` folder.
## 打包发布说明|Package Instruction
## 打包发布说明 | Package Instruction
打包发布前,确保执行阶段程序`easyspider_executestage(.exe)`已放入`chrome(_win64)`文件夹内,且浏览器插件`EasySpider_zh.crx`已经是最新版本。
执行下面的命令即可打包:
执行下面的命令即可打包(需要安装`Git`
```
npx electron-forge import
npm run package
```
@ -161,10 +162,9 @@ npm run package
Before packaging and releasing, make sure that the task execution program `easyspider_executestage(.exe)` is placed inside the `chrome(_win64)` folder and that the browser extension `EasySpider_en.crx` is the latest version.
After finishing developing, package software by the following command:
After finishing development, package the software with the following command (`Git` is required):
```
npx electron-forge import
npm run package
```
@ -186,8 +186,43 @@ package_win64.cmd
clean_and_release_win64.cmd
```
### (可选)编译成安装包|(Optional) Compile to an installation package
## 可能出现的问题 | Troubleshooting
以下命令一般不需要执行,但打包时可能会用到:
```sh
npm install @electron-forge/cli -g
npx electron-forge import
```
npm run make
如果任务执行到`npm install electron-squirrel-startup`的步骤时卡死,请参考下面的换源教程:[https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803)。
Windows端如果在运行`npm run package`的时候提示`node-gyp`相关的错误,可以安装`electron-rebuild`并重新编译相关模块:
```sh
npm install --save-dev electron-rebuild
npx electron-rebuild
```
然后再次运行`npm run package`
-----
The following commands are generally not required, but may be used during packaging:
```sh
npm install @electron-forge/cli -g
npx electron-forge import
```
If the process gets stuck at the `npm install electron-squirrel-startup` step, please refer to the following tutorial on switching to a faster package mirror: [https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803).
If you encounter `node-gyp` related errors when running `npm run package` on Windows, you can install `electron-rebuild` and recompile the relevant modules:
```sh
npm install --save-dev electron-rebuild
npx electron-rebuild
```
Then run `npm run package` again.

View File

@ -30,7 +30,7 @@ def update_file_version(file_path, new_version, key="当前版本/Current Versio
file.write(line)
version = "0.6.2"
version = "0.6.3"
# py html js
@ -47,7 +47,8 @@ if __name__ == "__main__":
# index.html
file_path = "./src/index.html"
update_file_version(file_path, version, key="当前版本/Current Version: <b>v")
update_file_version(file_path, version, key="软件当前版本:<b>v")
update_file_version(file_path, version, key="Current Version: <b>v")
# package.json
file_path = "./package.json"

View File

@ -11,9 +11,10 @@ del out\EasySpider\resources\app\vs_BuildTools.exe
move out\EasySpider ..\.temp_to_pub\EasySpider_windows_x32\EasySpider
rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Code
mkdir ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x32\Code
@REM copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x32\Code
@REM copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x32\Code
@REM copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\*.py ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\requirements.txt ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\Readme.md ..\.temp_to_pub\EasySpider_windows_x32\Code
copy ..\ExecuteStage\myCode.py ..\.temp_to_pub\EasySpider_windows_x32

View File

@ -11,9 +11,10 @@ del out\EasySpider\resources\app\vs_BuildTools.exe
move out\EasySpider ..\.temp_to_pub\EasySpider_windows_x64\EasySpider
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Code
mkdir ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x64\Code
@REM copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x64\Code
@REM copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x64\Code
@REM copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\*.py ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\requirements.txt ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\Readme.md ..\.temp_to_pub\EasySpider_windows_x64\Code
copy ..\ExecuteStage\myCode.py ..\.temp_to_pub\EasySpider_windows_x64

View File

@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data","lang":"zh"}

View File

@ -50,7 +50,9 @@ if (config.debug) {
}
let allWindowSockets = [];
let allWindowScoketNames = [];
task_server.start(config.webserver_port); //start local server
if(config.webserver_address.includes("localhost") || config.webserver_address.includes("127.0.0.1")) {
task_server.start(config.webserver_port); //start local server
}
let server_address = `${config.webserver_address}:${config.webserver_port}`;
const websocket_port = 8084; //目前只支持8084端口写死因为扩展里面写死了
console.log("server_address: " + server_address);
@ -84,11 +86,11 @@ console.log(process.arch);
if (process.platform === "win32" && process.arch === "ia32") {
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");
chromeBinaryPath = path.join(__dirname, "chrome_win32/chrome.exe");
execute_path = path.join(__dirname, "chrome_win32/execute.bat");
execute_path = path.join(__dirname, "chrome_win32/execute_win32.bat");
} else if (process.platform === "win32" && process.arch === "x64") {
driverPath = path.join(__dirname, "chrome_win64/chromedriver_win64.exe");
chromeBinaryPath = path.join(__dirname, "chrome_win64/chrome.exe");
execute_path = path.join(__dirname, "chrome_win64/execute.bat");
execute_path = path.join(__dirname, "chrome_win64/execute_win64.bat");
} else if (process.platform === "darwin") {
driverPath = path.join(__dirname, "chromedriver_mac64");
chromeBinaryPath = path.join(
@ -99,7 +101,7 @@ if (process.platform === "win32" && process.arch === "ia32") {
} else if (process.platform === "linux") {
driverPath = path.join(__dirname, "chrome_linux64/chromedriver_linux64");
chromeBinaryPath = path.join(__dirname, "chrome_linux64/chrome");
execute_path = path.join(__dirname, "chrome_linux64/execute.sh");
execute_path = path.join(__dirname, "chrome_linux64/execute_linux64.sh");
}
console.log(driverPath, chromeBinaryPath, execute_path);
let language = "en";
@ -112,6 +114,7 @@ let handle_pairs = {};
let socket_window = null;
let socket_start = null;
let socket_flowchart = null;
let socket_popup = null;
let invoke_window = null;
// var ffi = require('ffi-napi');
@ -148,8 +151,8 @@ function createWindow() {
server_address +
"/index.html?user_data_folder=" +
config.user_data_folder +
"&copyright=" +
config.copyright,
"&copyright=" + config.copyright +
"&lang=" + config.lang,
{extraHeaders: "pragma: no-cache\n"}
);
// 隐藏菜单栏
@ -160,9 +163,8 @@ function createWindow() {
app.quit();
}
});
//调试模式
// mainWindow.webContents.openDevTools();
// Open the DevTools.
// mainWindow.webContents.openDevTools()
}
async function findElementRecursive(driver, by, value, frames) {
@ -243,6 +245,7 @@ async function findElementAcrossAllWindows(
let handles = await driver.getAllWindowHandles();
// console.log("handles", handles);
let content_handle = current_handle;
let old_handle = current_handle;
let id = -1;
try {
id = msg.message.id;
@ -289,12 +292,12 @@ async function findElementAcrossAllWindows(
xpath = msg.xpath;
}
}
if (xpath.indexOf("Field(") >= 0 || xpath.indexOf("eval(") >= 0) {
if (xpath.indexOf("Field[") >= 0 || xpath.indexOf("eval(") >= 0) {
//两秒后通知浏览器
await new Promise((resolve) => setTimeout(resolve, 2000));
notify_browser(
'检测到XPath中包含Field("")或eval(""),试运行时无法正常定位到包含此两项表达式的元素,请在任务正式运行阶段测试是否有效。',
'Field("") or eval("") is detected in xpath, and the element containing these two expressions cannot be located normally during trial operation. Please test whether it is valid in the formal call stage.',
'检测到XPath中包含Field[""]或eval(""),试运行时无法正常定位到包含此两项表达式的元素,请在任务正式运行阶段测试是否有效。',
'Field[""] or eval("") is detected in xpath, and the element containing these two expressions cannot be located normally during trial operation. Please test whether it is valid in the formal call stage.',
"warning"
);
return null;
@ -308,7 +311,7 @@ async function findElementAcrossAllWindows(
if (h != null && handles.includes(h)) {
await driver.switchTo().window(h);
current_handle = h;
console.log("switch to handle: ", h);
console.log("Switch to handle: ", h);
}
element = await findElement(driver, By.xpath, xpath, iframe);
break;
@ -325,6 +328,12 @@ async function findElementAcrossAllWindows(
}
}
if (element == null && notifyBrowser) {
// 如果找不到元素,切换回原来的窗口
if (old_handle != null && handles.includes(old_handle)) {
await driver.switchTo().window(old_handle);
current_handle = old_handle;
console.log("Switch to handle: ", old_handle);
}
notify_browser(
"无法找到元素请检查XPath是否正确" + xpath,
"Cannot find the element, please check if the XPath is correct: " + xpath,
@ -654,7 +663,11 @@ async function beginInvoke(msg, ws) {
if (parameters.clickWay == 2){ //双击
await click_element(element, "double");
} else {
await click_element(element); //单击
if (parameters.newTab == 1){
await click_element(element, "loopClickEvery"); //新标签页打开
} else {
await click_element(element); //单击
}
}
}
let alertHandleType = parameters.alertHandleType;
@ -761,12 +774,12 @@ async function beginInvoke(msg, ws) {
keyInfo = keyInfo.replace(match[0], jsReplacedText.toString());
}
}
if (keyInfo.indexOf("Field(") >= 0 || keyInfo.indexOf("eval(") >= 0) {
if (keyInfo.indexOf("Field[") >= 0 || keyInfo.indexOf("eval(") >= 0) {
//两秒后通知浏览器
await new Promise((resolve) => setTimeout(resolve, 2000));
notify_browser(
'检测到文字中包含Field("")或eval(""),试运行时无法输入两项表达式的替换值,请在任务正式运行阶段测试是否有效。',
'Field("") or eval("") is detected in the text, and the replacement value of the two expressions cannot be entered during trial operation. Please test whether it is valid in the formal call stage.',
'检测到文字中包含Field[""]或eval(""),试运行时无法输入两项表达式的替换值,请在任务正式运行阶段测试是否有效。',
'Field[""] or eval("") is detected in the text, and the replacement value of the two expressions cannot be entered during trial operation. Please test whether it is valid in the formal call stage.',
"warning"
);
}
@ -1119,18 +1132,41 @@ async function beginInvoke(msg, ws) {
} catch {
console.log("Cannot get Cookies");
}
} else if (msg.type == 30) {
send_message_to_browser(
JSON.stringify({
type: "showAllToolboxes"
})
);
console.log("Show all toolboxes");
} else if (msg.type == 31) {
send_message_to_browser(
JSON.stringify({
type: "hideAllToolboxes"
})
);
console.log("Hide all toolboxes");
}
}
async function click_element(element, type = "click") {
try {
if (type == "loopClickEvery") {
await driver
if (process.platform === "darwin") {
await driver
.actions()
.keyDown(Key.COMMAND)
.click(element)
.keyUp(Key.COMMAND)
.perform();
} else {
await driver
.actions()
.keyDown(Key.CONTROL)
.click(element)
.keyUp(Key.CONTROL)
.perform();
}
} else if (type.includes("point(")) {
//point(10, 20)表示点击坐标为(10, 20)的位置
let point = type.substring(6, type.length - 1).split(",");
@ -1177,12 +1213,12 @@ async function execute_js(js, element, wait_time = 3) {
);
outcome = -1;
}
if (js.indexOf("Field(") >= 0 || js.indexOf("eval(") >= 0) {
if (js.indexOf("Field[") >= 0 || js.indexOf("eval(") >= 0) {
//两秒后通知浏览器
await new Promise((resolve) => setTimeout(resolve, 2000));
notify_browser(
'检测到JavaScript中包含Field("")或eval(""),试运行时无法执行两项表达式,请在任务正式运行阶段测试是否有效。',
'Field("") or eval("") is detected in JavaScript, and the two expressions cannot be executed during trial operation. Please test whether it is valid in the formal call stage.',
'检测到JavaScript中包含Field[""]或eval(""),试运行时无法执行两项表达式,请在任务正式运行阶段测试是否有效。',
'Field[""] or eval("") is detected in JavaScript, and the two expressions cannot be executed during trial operation. Please test whether it is valid in the formal call stage.',
"warning"
);
}
@ -1253,6 +1289,9 @@ wss.on("connection", function (ws) {
// console.log("socket_flowchart closed");
// });
console.log("set socket_flowchart at time: ", new Date());
} else if (msg.message.id == 3) {
socket_popup = ws;
console.log("set socket_popup at time: ", new Date());
} else {
//其他的ID是用来标识不同的浏览器标签页的
// await new Promise(resolve => setTimeout(resolve, 200));
@ -1543,6 +1582,17 @@ app.whenReady().then(() => {
path.join(task_server.getDir(), "config.json"),
JSON.stringify(config)
);
//重新读取配置文件
config = JSON.parse(fs.readFileSync(path.join(task_server.getDir(), "config.json")));
});
// IPC handler for the "change-lang" channel: stores the received value in
// config.lang and writes the whole config object to config.json inside the
// directory returned by task_server.getDir().
ipcMain.on("change-lang", function (event, arg) {
config.lang = arg;
fs.writeFileSync(
path.join(task_server.getDir(), "config.json"),
JSON.stringify(config)
);
// Re-read the configuration file that was just written
config = JSON.parse(fs.readFileSync(path.join(task_server.getDir(), "config.json")));
});
createWindow();

View File

@ -1,24 +1,24 @@
{
"name": "easy-spider",
"version": "0.6.0",
"version": "0.6.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "easy-spider",
"version": "0.6.0",
"version": "0.6.3",
"license": "AGPL-3.0",
"dependencies": {
"cors": "^2.8.5",
"electron-squirrel-startup": "^1.0.0",
"express": "^4.19.2",
"express": "^4.21.2",
"formidable": "^3.5.0",
"http": "^0.0.1-security",
"multer": "^1.4.5-lts.1",
"node-abi": "^3.52.0",
"node-window-manager": "^2.2.4",
"selenium-webdriver": "^4.16.0",
"ws": "^8.12.0",
"selenium-webdriver": "^4.27.0",
"ws": "^8.18.0",
"xlsx": "^0.18.5"
},
"devDependencies": {
@ -30,6 +30,11 @@
"electron": "^27.1.3"
}
},
"node_modules/@bazel/runfiles": {
"version": "6.3.1",
"resolved": "https://registry.npmjs.org/@bazel/runfiles/-/runfiles-6.3.1.tgz",
"integrity": "sha512-1uLNT5NZsUVIGS4syuHwTzZ8HycMPyr6POA3FCE4GbMtc4rhoJk8aZKtNIRthJYfL+iioppi+rTfH3olMPr9nA=="
},
"node_modules/@electron-forge/cli": {
"version": "6.2.1",
"dev": true,
@ -1203,6 +1208,7 @@
},
"node_modules/balanced-match": {
"version": "1.0.2",
"dev": true,
"license": "MIT"
},
"node_modules/base64-js": {
@ -1253,9 +1259,9 @@
"license": "MIT"
},
"node_modules/body-parser": {
"version": "1.20.2",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz",
"integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==",
"version": "1.20.3",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz",
"integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==",
"dependencies": {
"bytes": "3.1.2",
"content-type": "~1.0.5",
@ -1265,7 +1271,7 @@
"http-errors": "2.0.0",
"iconv-lite": "0.4.24",
"on-finished": "2.4.1",
"qs": "6.11.0",
"qs": "6.13.0",
"raw-body": "2.5.2",
"type-is": "~1.6.18",
"unpipe": "1.0.0"
@ -1307,6 +1313,7 @@
},
"node_modules/brace-expansion": {
"version": "1.1.11",
"dev": true,
"license": "MIT",
"dependencies": {
"balanced-match": "^1.0.0",
@ -1314,11 +1321,12 @@
}
},
"node_modules/braces": {
"version": "3.0.2",
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
"integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
"dev": true,
"license": "MIT",
"dependencies": {
"fill-range": "^7.0.1"
"fill-range": "^7.1.1"
},
"engines": {
"node": ">=8"
@ -1667,6 +1675,7 @@
},
"node_modules/concat-map": {
"version": "0.0.1",
"dev": true,
"license": "MIT"
},
"node_modules/concat-stream": {
@ -1727,9 +1736,9 @@
}
},
"node_modules/cookie": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz",
"integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==",
"version": "0.7.1",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.1.tgz",
"integrity": "sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==",
"engines": {
"node": ">= 0.6"
}
@ -2188,9 +2197,9 @@
"license": "MIT"
},
"node_modules/encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
"integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
"integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
"engines": {
"node": ">= 0.8"
}
@ -2397,36 +2406,36 @@
"license": "Apache-2.0"
},
"node_modules/express": {
"version": "4.19.2",
"resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz",
"integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==",
"version": "4.21.2",
"resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz",
"integrity": "sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==",
"dependencies": {
"accepts": "~1.3.8",
"array-flatten": "1.1.1",
"body-parser": "1.20.2",
"body-parser": "1.20.3",
"content-disposition": "0.5.4",
"content-type": "~1.0.4",
"cookie": "0.6.0",
"cookie": "0.7.1",
"cookie-signature": "1.0.6",
"debug": "2.6.9",
"depd": "2.0.0",
"encodeurl": "~1.0.2",
"encodeurl": "~2.0.0",
"escape-html": "~1.0.3",
"etag": "~1.8.1",
"finalhandler": "1.2.0",
"finalhandler": "1.3.1",
"fresh": "0.5.2",
"http-errors": "2.0.0",
"merge-descriptors": "1.0.1",
"merge-descriptors": "1.0.3",
"methods": "~1.1.2",
"on-finished": "2.4.1",
"parseurl": "~1.3.3",
"path-to-regexp": "0.1.7",
"path-to-regexp": "0.1.12",
"proxy-addr": "~2.0.7",
"qs": "6.11.0",
"qs": "6.13.0",
"range-parser": "~1.2.1",
"safe-buffer": "5.2.1",
"send": "0.18.0",
"serve-static": "1.15.0",
"send": "0.19.0",
"serve-static": "1.16.2",
"setprototypeof": "1.2.0",
"statuses": "2.0.1",
"type-is": "~1.6.18",
@ -2435,6 +2444,10 @@
},
"engines": {
"node": ">= 0.10.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/express"
}
},
"node_modules/express/node_modules/debug": {
@ -2556,9 +2569,10 @@
}
},
"node_modules/fill-range": {
"version": "7.0.1",
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
"integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
"dev": true,
"license": "MIT",
"dependencies": {
"to-regex-range": "^5.0.1"
},
@ -2567,12 +2581,12 @@
}
},
"node_modules/finalhandler": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz",
"integrity": "sha512-5uXcUVftlQMFnWC9qu/svkWv3GTd2PfUhK/3PLkYNAe7FbqJMt3515HaxE6eRL74GdsriiwujiawdaB1BpEISg==",
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz",
"integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==",
"dependencies": {
"debug": "2.6.9",
"encodeurl": "~1.0.2",
"encodeurl": "~2.0.0",
"escape-html": "~1.0.3",
"on-finished": "2.4.1",
"parseurl": "~1.3.3",
@ -2736,6 +2750,7 @@
},
"node_modules/fs.realpath": {
"version": "1.0.0",
"dev": true,
"license": "ISC"
},
"node_modules/function-bind": {
@ -2885,6 +2900,7 @@
},
"node_modules/glob": {
"version": "7.2.3",
"dev": true,
"license": "ISC",
"dependencies": {
"fs.realpath": "^1.0.0",
@ -3234,6 +3250,7 @@
},
"node_modules/inflight": {
"version": "1.0.6",
"dev": true,
"license": "ISC",
"dependencies": {
"once": "^1.3.0",
@ -3343,8 +3360,9 @@
},
"node_modules/is-number": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
"integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=0.12.0"
}
@ -3713,9 +3731,12 @@
}
},
"node_modules/merge-descriptors": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
"integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w=="
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz",
"integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==",
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/merge2": {
"version": "1.4.1",
@ -3793,6 +3814,7 @@
},
"node_modules/minimatch": {
"version": "3.1.2",
"dev": true,
"license": "ISC",
"dependencies": {
"brace-expansion": "^1.1.7"
@ -4159,9 +4181,12 @@
}
},
"node_modules/object-inspect": {
"version": "1.13.1",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz",
"integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==",
"version": "1.13.2",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz",
"integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==",
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
@ -4357,6 +4382,7 @@
},
"node_modules/path-is-absolute": {
"version": "1.0.1",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=0.10.0"
@ -4399,9 +4425,9 @@
}
},
"node_modules/path-to-regexp": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
"integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ=="
"version": "0.1.12",
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
"integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ=="
},
"node_modules/path-type": {
"version": "2.0.0",
@ -4571,11 +4597,11 @@
}
},
"node_modules/qs": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz",
"integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==",
"version": "6.13.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
"integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==",
"dependencies": {
"side-channel": "^1.0.4"
"side-channel": "^1.0.6"
},
"engines": {
"node": ">=0.6"
@ -4807,6 +4833,7 @@
},
"node_modules/rimraf": {
"version": "3.0.2",
"dev": true,
"license": "ISC",
"dependencies": {
"glob": "^7.1.3"
@ -4874,16 +4901,27 @@
"license": "MIT"
},
"node_modules/selenium-webdriver": {
"version": "4.16.0",
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.16.0.tgz",
"integrity": "sha512-IbqpRpfGE7JDGgXHJeWuCqT/tUqnLvZ14csSwt+S8o4nJo3RtQoE9VR4jB47tP/A8ArkYsh/THuMY6kyRP6kuA==",
"version": "4.27.0",
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.27.0.tgz",
"integrity": "sha512-LkTJrNz5socxpPnWPODQ2bQ65eYx9JK+DQMYNihpTjMCqHwgWGYQnQTCAAche2W3ZP87alA+1zYPvgS8tHNzMQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/SeleniumHQ"
},
{
"type": "opencollective",
"url": "https://opencollective.com/selenium"
}
],
"dependencies": {
"@bazel/runfiles": "^6.3.1",
"jszip": "^3.10.1",
"tmp": "^0.2.1",
"ws": ">=8.14.2"
"tmp": "^0.2.3",
"ws": "^8.18.0"
},
"engines": {
"node": ">= 14.20.0"
"node": ">= 14.21.0"
}
},
"node_modules/semver": {
@ -4916,9 +4954,9 @@
}
},
"node_modules/send": {
"version": "0.18.0",
"resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
"integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==",
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz",
"integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==",
"dependencies": {
"debug": "2.6.9",
"depd": "2.0.0",
@ -4951,6 +4989,14 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
},
"node_modules/send/node_modules/encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
"integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
"engines": {
"node": ">= 0.8"
}
},
"node_modules/send/node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
@ -4984,14 +5030,14 @@
}
},
"node_modules/serve-static": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
"integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==",
"version": "1.16.2",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz",
"integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==",
"dependencies": {
"encodeurl": "~1.0.2",
"encodeurl": "~2.0.0",
"escape-html": "~1.0.3",
"parseurl": "~1.3.3",
"send": "0.18.0"
"send": "0.19.0"
},
"engines": {
"node": ">= 0.8.0"
@ -5414,13 +5460,11 @@
"license": "MIT"
},
"node_modules/tmp": {
"version": "0.2.1",
"license": "MIT",
"dependencies": {
"rimraf": "^3.0.0"
},
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz",
"integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==",
"engines": {
"node": ">=8.17.0"
"node": ">=14.14"
}
},
"node_modules/tmp-promise": {
@ -5434,8 +5478,9 @@
},
"node_modules/to-regex-range": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
"integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"is-number": "^7.0.0"
},
@ -5693,9 +5738,9 @@
"license": "ISC"
},
"node_modules/ws": {
"version": "8.14.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.14.2.tgz",
"integrity": "sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==",
"version": "8.18.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
"engines": {
"node": ">=10.0.0"
},

View File

@ -1,7 +1,7 @@
{
"name": "easy-spider",
"productName": "EasySpider",
"version": "0.6.2",
"version": "0.6.3",
"icon": "./favicon",
"description": "NoCode Visual Web Crawler",
"main": "main.js",
@ -33,14 +33,14 @@
"dependencies": {
"cors": "^2.8.5",
"electron-squirrel-startup": "^1.0.0",
"express": "^4.19.2",
"express": "^4.21.2",
"formidable": "^3.5.0",
"http": "^0.0.1-security",
"multer": "^1.4.5-lts.1",
"node-abi": "^3.52.0",
"node-window-manager": "^2.2.4",
"selenium-webdriver": "^4.16.0",
"ws": "^8.12.0",
"selenium-webdriver": "^4.27.0",
"ws": "^8.18.0",
"xlsx": "^0.18.5"
},
"config": {
@ -67,7 +67,7 @@
],
"packagerConfig": {
"icon": "./favicon",
"appVersion": "0.6.2",
"appVersion": "0.6.3",
"name": "EasySpider",
"executableName": "EasySpider",
"appCopyright": "Naibo Wang (naibowang@foxmail.com)",

View File

@ -20,9 +20,10 @@ rm out/EasySpider/resources/app/vs_BuildTools.exe
mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/Code
mkdir ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_Linux_x64/Code
# cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_Linux_x64/Code
# cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_Linux_x64/Code
# cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/*.py ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/Readme.md ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/myCode.py ../.temp_to_pub/EasySpider_Linux_x64

View File

@ -20,9 +20,10 @@ rm -r ../.temp_to_pub/EasySpider_MacOS/EasySpider.app/Contents/Resources/app/use
rm -r ../.temp_to_pub/EasySpider_MacOS/EasySpider.app/Contents/Resources/app/TempUserDataFolder
rm -rf ../.temp_to_pub/EasySpider_MacOS/Code
mkdir ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS/Code
# cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS/Code
# cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS/Code
# cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/*.py ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/Readme.md ../.temp_to_pub/EasySpider_MacOS/Code
cp ../ExecuteStage/myCode.py ../.temp_to_pub/EasySpider_MacOS

View File

@ -66,6 +66,7 @@ if (!fs.existsSync(path.join(getDir(), "config.json"))) {
webserver_port: 8074,
user_data_folder: "./user_data",
debug: false,
lang: "-",
copyright: 0,
sys_arch: require("os").arch(),
mysql_config_path: "./mysql_config.json",
@ -121,6 +122,12 @@ exports.start = function (port = 8074) {
res.setHeader("Access-Control-Allow-Origin", "*"); // 设置可访问的源
// 解析参数
const pathName = url.parse(req.url).pathname;
const safeBase = path.join(__dirname, "src");
// Map a request path onto `base`. The target is normalized as an absolute
// POSIX path first (collapsing "." and ".." segments), then re-rooted under
// `base` as a relative path.
const safeJoin = (base, target) =>
  path.join(base, "." + path.posix.normalize("/" + target));
if (pathName == "/excelUpload" && req.method.toLowerCase() === "post") {
// // parse a file upload
// let form = new formidable.IncomingForm();
@ -160,8 +167,16 @@ exports.start = function (port = 8074) {
else {
//如果有后缀名, 则为前端请求
// console.log(path.join(__dirname,"src/taskGrid", pathName));
const filePath = safeJoin(safeBase, pathName);
if (!filePath.startsWith(safeBase)) {
res.writeHead(400, { "Content-Type": 'text/html;charset="utf-8"' });
res.end("Invalid path");
return;
}
fs.readFile(
path.join(__dirname, "src", pathName),
filePath,
async (err, data) => {
if (err) {
res.writeHead(404, {
@ -200,7 +215,7 @@ exports.start = function (port = 8074) {
let item = {
id: task.id,
name: task.name,
url: task.url,
url: task.links.split("\n")[0],
mtime: stat.mtime,
links: task.links,
desc: task.desc,
@ -445,6 +460,10 @@ exports.start = function (port = 8074) {
"utf8"
);
config_file = JSON.parse(config_file);
let lang = config_file["lang"];
if(lang == undefined){
lang = "-";
}
res.write(JSON.stringify(config_file));
res.end();
} else if (pathName == "/setUserDataFolder") {

View File

@ -32,7 +32,7 @@
<body>
<div id="app">
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-if="init">
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-if="lang=='-'">
<h5 style="margin-top: 20px">选择语言/Select Language</h5>
<p><a @click="changeLang('zh')" class="btn btn-outline-primary btn-lg"
@ -40,9 +40,6 @@
<p><a @click="changeLang('en')" class="btn btn-outline-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;">English</a></p>
<p style="font-size: 17px">当前版本/Current Version: <b>v0.6.2</b></p>
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
target="_blank">Github</a>最新版本/Newest Version<b>{{newest_version}}</b></p>
<!-- <p>如发现新版本更新可从以下Github仓库下载最新版本使用/If a new version is found, you can download the latest version from the following Github repository:</p>-->
<!-- <p></p>-->
<div class="img-container">
@ -64,8 +61,8 @@
<textarea class="form-control"
style="margin:0 auto;width:90%; color:black; height: 450px; min-height: 200px; background: white"
readonly>
This software is intended for educational and communication purposes only. It is strictly prohibited to use the software for any illegal activities or operations, such as crawling government/military websites that are not allowed to be crawled. The user bears all consequences resulting from the use of this software and the author shall not be held responsible or liable in any way. Furthermore, the software is protected by patent rights. If you intend to use it for commercial purposes or profit-making activities, such as using the software for client orders, selling the collected data, please contact author: naibowang@foxmail.com for patent authorization and payment operations: https://www.patentguru.com/cn/search?q=一种自定义提取流程的服务封装系统
For individual users, EasySpider is a completely free and ad-free open-source software. The development and maintenance of the software rely solely on the author's voluntary efforts. Therefore, you can choose to support the author, allowing them to have more enthusiasm and energy to maintain this software. Alternatively, if you have profited from using this software, you are welcome to support the author through the following methods:
This software is intended for educational and communication purposes only. It is strictly prohibited to use the software for any illegal activities or operations, such as crawling government/military websites that are not allowed to be crawled. The user bears all consequences resulting from the use of this software and the author shall not be held responsible or liable in any way.
EasySpider is completely free, ad-free, open-source software. The development and maintenance of the software rely solely on the author's voluntary efforts. Therefore, you can choose to support the author, allowing them to have more enthusiasm and energy to maintain this software. Alternatively, if you have profited from using this software, you are welcome to support the author through the following methods:
1. PayPal account: naibowang, or scan the QR code provided in the software package.
2. Alipay account: naibowang@foxmail.com, or scan the QR code provided in the software package.
@ -92,6 +89,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
<a href="https://www.easyspider.cn/index_english.html" target="_blank"
style="text-align: center; font-size: 18px">Browse official website to watch tutorials</a>
</p>
<p style="font-size: 17px">Current Version: <b>v0.6.3</b></p>
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
target="_blank">Newest</a> Version: <b>{{newest_version}}</b></p>
<div class="img-container">
<!-- <h5>Producer</h5>-->
<a href="https://www.zju.edu.cn" alt="Zhejiang University" target="_blank"><img
@ -164,9 +164,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
<textarea class="form-control"
style="margin:0 auto;width:90%; color:black; height: 480px; min-height: 200px; background: white"
readonly>
本软件仅供学习交流使用,严禁使用软件进行任何违法违规的操作,如爬取不允许爬取的政府/军事机关网站等。使用本软件所造成的一切后果由使用者自负,与作者本人无关,作者不会承担任何责任。同时软件受到专利权保护如要用于商业用途如使用软件进行盈利接单用于公司业务或出售采集到的数据等请邮件联系作者naibowang@foxmail.com进行专利授权等付费操作https://www.patentguru.com/cn/search?q=一种自定义提取流程的服务封装系统
本软件仅供学习交流使用,严禁使用软件进行任何违法违规的操作,如爬取不允许爬取的政府/军事机关网站等。使用本软件所造成的一切后果由使用者自负,与作者本人无关,作者不会承担任何责任。
对于个人使用者来说,易采集EasySpider是一款完全免费无广告的开源软件软件开发和维护全靠作者用爱发电因此您可以选择支持作者让作者有更多的热情和精力维护此软件或者您使用了此软件进行了盈利欢迎您通过下面的方式支持作者
易采集EasySpider是一款完全免费无广告的开源软件软件开发和维护全靠作者用爱发电因此您可以选择支持作者让作者有更多的热情和精力维护此软件或者您使用了此软件进行了盈利欢迎您通过下面的方式支持作者
1、支付宝账号naibowang@foxmail.com也可以扫描软件包中带的二维码。
2、微信收款扫描软件包中带的二维码。
@ -191,6 +191,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
<a href="https://www.easyspider.cn?lang=zh" target="_blank"
style="text-align: center; font-size: 18px">点此访问官网查看文档/视频教程</a>
</p>
<p style="font-size: 17px">软件当前版本:<b>v0.6.3</b></p>
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
target="_blank">官网</a>最新版本:<b>{{newest_version}}</b></p>
<div class="img-container">
<!-- <h5>出品方</h5>-->
<a href="https://www.zju.edu.cn" alt="浙江大学" target="_blank"><img src="img/zju.png"></a>

View File

@ -22,7 +22,7 @@ let app = Vue.createApp({
data() {
return {
init: true,
lang: 'zh',
lang: '-',
user_data_folder: getUrlParam("user_data_folder"),
copyright: 0,
step: 0,
@ -34,6 +34,10 @@ let app = Vue.createApp({
if(this.copyright == 0){
this.step = -1;
}
this.lang = getUrlParam("lang");
if (this.lang == 'undefined' || this.lang == '') {
this.lang = '-';
}
// 发送GET请求获取GitHub的Release API响应
const request = new XMLHttpRequest();
request.open('GET', `https://api.github.com/repos/NaiboWang/EasySpider/releases/latest`);
@ -52,8 +56,9 @@ let app = Vue.createApp({
},
methods: {
changeLang(lang = 'zh') {
this.init = false;
// this.init = false;
this.lang = lang;
window.electronAPI.changeLang(lang);
},
acceptAgreement() {
this.step = 0;

View File

@ -11,4 +11,5 @@ contextBridge.exposeInMainWorld('electronAPI', {
startDesign: (lang="en", user_data_folder = '', mobile=false) => ipcRenderer.send('start-design', lang, user_data_folder, mobile),
startInvoke: (lang="en") => ipcRenderer.send('start-invoke', lang),
acceptAgreement: () => ipcRenderer.send('accept-agreement'),
changeLang: (lang="en") => ipcRenderer.send('change-lang', lang)
})

View File

@ -580,7 +580,7 @@ Please note that this feature does not support assigning values to variables. In
Loop based on the expression value of Python code. Here are some examples:
1. Return relevant values of the current browser object. Use `self.browser` to refer to the current browser being operated. You can directly use Selenium's API to perform operations, such as `self.browser.find_element(By.CSS_SELECTOR, "body").text=="123"`, which checks whether the current page contains the text "123".
2. Return the value of a custom global variable: `self.myVar`
3. Return the result of a conditional statement: `self.myVar == 1`
3. Return the result of a conditional statement: `self.myVar > 1`
4. Check whether the value extracted from a field equals the value of a variable: `self.outputParameters["field name"] == self.myVar`
If the expression returns a value greater than 0 or evaluates to True, the loop continues; otherwise, it stops.
</pre>

View File

@ -579,8 +579,8 @@ print(emotlib.emoji()) # 使用其中的函数。
<pre class="form-control" style="background: white; margin-top: 20px; min-height: 220px; font-size: 15px!important; word-wrap: break-word; white-space: pre-wrap; border-radius: 0; border: 1px solid" disabled v-if='parseInt(loopType) == 7'>请先阅读此说明再在上方输入框不是本框写具体代码如果要执行大量代码可以直接写outside:myCode.py这样程序就会读取并执行EasySpider目录下的myCode.py中的代码。
根据Python代码的表达式值来决定是否循环示例
1. 返回当前浏览器对象的相关值用self.browser表示当前操作的浏览器可直接用selenium的API进行操作如self.browser.find_element(By.CSS_SELECTOR, "body").text=="123"表示判断当前页面是否为123这个文本。
2. 返回自定义全局变量的值self.myVar,如果
3. 返回条件判断的值self.myVar == 1
2. 返回自定义全局变量的值self.myVar
3. 返回条件判断的值self.myVar > 1
4. 判断某个字段提取的值是否等于某个变量的值self.outputParameters["字段名"] == self.myVar
以上表达式返回值大于0或为真则继续循环否则停止循环。
</pre>

View File

@ -91,7 +91,7 @@
value="about:blank"></input>
<label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction"
target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
<label v-if="OS=='darwin'">{{`对于MacOS系统EasySpider提供了两个不同的执行程序分别为easyspider_executestage和easyspider_executestage_full前者执行时加载速度较快并提供了除OCR识别和数据去重以外的全部功能后者则提供了包括OCR识别和数据去重在内的全部功能但运行时加载速度较慢需要等待2-10分钟才能执行程序请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
<label v-if="OS=='MacOS'">{{`对于MacOS系统EasySpider提供了两个不同的执行程序分别为easyspider_executestage和easyspider_executestage_full前者执行时加载速度较快并提供了除OCR识别和数据去重以外的全部功能后者则提供了包括OCR识别和数据去重在内的全部功能但运行时加载速度较慢需要等待2-10分钟才能执行程序请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider can quit when executing command for ease of timed execution, and you can set --read_type to "remote" for remote execution):~请在EasySpider目录下打开命令行工具Terminal Windows请使用PowerShell而不是CMD然后复制Command/Ctrl + c和运行以下命令以执行任务执行命令时可以退出EasySpider以方便定时执行如需要远程调用则需要将--read_type设置为remote并设置远程地址` | lang }}</label>
<textarea class="form-control" style="height:150px">cd {{easyspider_location}}
{{command}} --config_folder "{{config_folder}}" --headless 0 --read_type local --config_file_name config.json --saved_file_name </textarea>
@ -348,7 +348,7 @@
config_folder: "",
easyspider_location: "",
mysql_config_path: "",
OS: "win32",
OS: "Windows",
}, mounted() {
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
app.$data.user_data_folder = result.user_data_folder;
@ -412,7 +412,7 @@
form_data.append('file', $('#excelFile').prop('files')[0]);
// console.log(app.$data.backEndAddressServiceWrapper + "/excelUpload",)
$.ajax({
url: app.$data.backEndAddressServiceWrapper.replace("8074", "8075") + "/excelUpload",
url: "http://localhost:8075/excelUpload",
type: 'POST',
data: form_data,
processData: false,
@ -559,12 +559,14 @@
};
app.$data.ID = result;
ws.send(JSON.stringify(message));
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
if (OSInfo.version == 'darwin') {
changeCommand();
$('#myModal').modal('show');
}
});
// 使用函数并打印结果
const systemInfo = detectOperatingSystemAndArch();
// $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
if (systemInfo.OS == 'MacOS') {
changeCommand();
$('#myModal').modal('show');
}
// });
});
// }
},
@ -574,15 +576,17 @@
});
function changeCommand() {
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
app.$data.OS = OSInfo.version;
if (OSInfo.version == 'win32' && OSInfo.bit == 'x64') {
// $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
// app.$data.OS = systemInfo.OS;
const systemInfo = detectOperatingSystemAndArch();
app.$data.OS = systemInfo.OS;
if (systemInfo.OS == 'Windows' && systemInfo.architecture == 'x64') {
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if (OSInfo.version == 'win32' && OSInfo.bit == 'ia32') {
} else if (systemInfo.OS == 'Windows' && systemInfo.architecture == 'ia32') {
app.$data.command = "./EasySpider/resources/app/chrome_win32/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if (OSInfo.version == 'linux') {
} else if (systemInfo.OS == 'Linux') {
app.$data.command = "./EasySpider/resources/app/chrome_linux64/easyspider_executestage --ids '[" + app.$data.ID.toString() + "]' --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if (OSInfo.version == 'darwin') {
} else if (systemInfo.OS == 'MacOS') {
if (getUrlParam("lang") == "zh") {
app.$data.easyspider_location = "你的EasySpider文件夹cd /Users/" + app.$data.config_folder.split("/")[2] + "/Downloads/EasySpider_MacOS";
} else {
@ -590,7 +594,7 @@
}
app.$data.command = "./easyspider_executestage --ids '[" + app.$data.ID.toString() + "]' --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
}
});
// });
}
$.get(app.$data.backEndAddressServiceWrapper + "/queryTask?id=" + sId, function (result) {

View File

@ -127,3 +127,27 @@ document.onkeydown = function (e) {
}
}
}
/**
 * Guess the host operating system and CPU word size from the browser's
 * `navigator` object.
 *
 * The OS is chosen by the first marker ("win", "mac", "linux", in that
 * order) found in the lower-cased `navigator.platform`. The architecture is
 * reported as 64-bit when the lower-cased user agent mentions "wow64" or
 * "win64", or the platform string mentions "x86_64" or "amd64"; every other
 * case is reported as "ia32".
 *
 * @returns {{OS: string, architecture: string}} `OS` is one of "Windows",
 *   "MacOS", "Linux" or "Unknown"; `architecture` is "x64" or "ia32".
 */
function detectOperatingSystemAndArch() {
  const platformName = navigator.platform.toLowerCase();
  const agent = navigator.userAgent.toLowerCase();

  // Ordered table: the first marker contained in the platform string wins.
  const osMarkers = [
    ['win', 'Windows'],
    ['mac', 'MacOS'],
    ['linux', 'Linux'],
  ];
  const matched = osMarkers.find(([marker]) => platformName.includes(marker));
  const OS = matched ? matched[1] : 'Unknown';

  // 64-bit hints live in the user agent (Windows) or the platform string (Linux).
  const is64Bit =
    ['wow64', 'win64'].some((token) => agent.includes(token)) ||
    ['x86_64', 'amd64'].some((token) => platformName.includes(token));

  return { OS, architecture: is64Bit ? 'x64' : 'ia32' };
}

View File

@ -491,7 +491,7 @@ if (mobile == "true") {
}
let serviceInfo = {
"version": "0.6.2"
"version": "0.6.3"
};
function saveService(type) {
@ -625,7 +625,7 @@ function saveService(type) {
"links": links,
"create_time": $("#create_time").val(),
"update_time": formatDateTime(new Date()),
"version": "0.6.2",
"version": "0.6.3",
"saveThreshold": saveThreshold,
// "cloudflare": cloudflare,
"quitWaitTime": parseInt($("#quitWaitTime").val()),

View File

@ -0,0 +1,3 @@
// Entry script: load the task server module that sits next to this file and
// start it on port 8074.
const { join: joinPath } = require("path");
const taskServer = require(joinPath(__dirname, "server.js"));
taskServer.start(8074); // start local server

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":321,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 07:02:02","update_time":"2024-04-22 07:02:16","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}

View File

@ -0,0 +1 @@
{"id":322,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 08:13:15","update_time":"2024-04-22 08:13:33","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code
":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}

View File

@ -0,0 +1 @@
{"id":323,"name":"新web采集任务","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"","update_time":"2024-08-10 17:29:04","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":325,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-12-30 22:37:29","update_time":"2024-12-30 22:37:43","version":"0.6.3","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"0暖心2024 总书记的贴心话"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":f
alse,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li[1]/a[1]","//a[contains(., '0暖心2024 总')]","//a[@class='title-content c-link c-font-medium c-line-clamp1']","/html/body/div[last()-4]/div[last()-3]/div[last()-3]/div/div/div/ul/li[last()-9]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"0暖心2024 
总书记的贴心话"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

File diff suppressed because one or more lines are too long

View File

@ -48,7 +48,7 @@ def copy_folder(source_folder, destination_folder):
def get_chrome_version():
version = "120"
version = "131"
if sys.platform == "win32":
version_re = re.compile(r"^[1-9]\d*\.\d*.\d*")
try:
@ -90,6 +90,8 @@ old_driver_version = {
}
if __name__ == "__main__":
os.system("npm install -g extract-stealth-evasions") # 安装stealth.min.js
os.system("npx extract-stealth-evasions") # 提取stealth.min.js
driver_downloads = []
response = requests.get(chrome_driver_url)
if response.status_code == 200:
@ -150,7 +152,7 @@ if __name__ == "__main__":
for folder in os.listdir("./chrome_win64"):
if folder[0].isdigit() and os.path.isdir("./chrome_win64/"+folder):
shutil.rmtree("./chrome_win64/"+folder+"/Installer") # 删除Installer文件夹
copy_file("./execute_win64.bat", "./chrome_win64/execute.bat")
copy_file("./execute_win64.bat", "./chrome_win64/execute_win64.bat")
copy_file("./stealth.min.js", "./chrome_win64/stealth.min.js")
try:
copy_file(
@ -177,7 +179,7 @@ if __name__ == "__main__":
for folder in os.listdir("./chrome_win32"):
if folder[0].isdigit() and os.path.isdir("./chrome_win32/"+folder):
shutil.rmtree("./chrome_win32/"+folder+"/Installer") # 删除Installer文件夹
copy_file("./execute_win32.bat", "./chrome_win32/execute.bat")
copy_file("./execute_win32.bat", "./chrome_win32/execute_win32.bat")
copy_file("./stealth.min.js", "./chrome_win32/stealth.min.js")
try:
copy_file(
@ -201,7 +203,7 @@ if __name__ == "__main__":
if os.path.exists("./chrome_linux64"):
shutil.rmtree("./chrome_linux64")
copy_folder(linux_chrome_path, "./chrome_linux64")
copy_file("./execute_linux64.sh", "./chrome_linux64/execute.sh")
copy_file("./execute_linux64.sh", "./chrome_linux64/execute_linux64.sh")
copy_file("./stealth.min.js", "./chrome_linux64/stealth.min.js")
try:
copy_file(
@ -216,7 +218,7 @@ if __name__ == "__main__":
finally:
# Change Linux file permissions
os.chmod("./chrome_linux64/chromedriver_linux64", 0o755)
os.chmod("./chrome_linux64/execute.sh", 0o755)
os.chmod("./chrome_linux64/execute_linux64.sh", 0o755)
shutil.rmtree("./chromedrivers")
elif sys.platform == "darwin" and platform.architecture()[0] == "64bit":
processor = get_processor_info()

View File

@ -12,8 +12,9 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[83]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote"]
"args": ["--ids", "[0]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote",
]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}
]

View File

@ -1,4 +1,8 @@
# 环境编译说明|Environment Compilation Instruction
## 视频教程
[从源代码编译程序并设计运行和调试任务指南基于Ubuntu24.04](https://www.bilibili.com/video/BV1VE421P7yj/)
# 环境编译说明 | Environment Compilation Instruction
EasySpider分三部分
@ -18,20 +22,20 @@ EasySpider is divided into three parts:
This section covers the compilation instructions for the `Execution stage program`.
## 建议编译顺序|Suggested Compilation Order
## 建议编译顺序 | Suggested Compilation Order
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
3. 编译执行阶段程序,否则无法执行程序,只能设计程序
3. 编译执行阶段程序,否则无法执行任务,只能设计任务
-----
1. Compile the browser extension first; otherwise the main program will report that `EasySpider_en.crx` cannot be found when it runs.
2. Compile the main program. At this point the main program runs normally, but it can only design tasks, not execute them.
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
3. Compile the execution stage program; otherwise tasks can only be designed, not executed.
## 环境构建|Environment Setup
## 环境构建 | Environment Setup
1. 安装Python 3.7及以上版本并添加至系统环境变量:[https://www.python.org/downloads/](https://www.python.org/downloads/)。
2. 安装`pip3`并添加至系统环境变量(Windows安装python后会自带pip,Linux和MacOS安装方式请自行搜索)。
@ -51,7 +55,7 @@ This section covers the compilation instructions for the `Execution stage progra
pip3 install -r requirements.txt
```
## 运行说明|Run Instruction
## 运行说明 | Run Instruction
运行程序前,确保已经完成了`ElectronJS`文件夹下`主程序`的编译,保证`chrome`文件夹和`chromedriver`环境已经就绪,同时**EasySpider主程序已在运行中**。
@ -75,13 +79,13 @@ python3 easyspider_executestage.py --ids [1]
The above is an example command to run a task with the ID of `1`. For more information on command-line parameters, please refer to: [Argument Instruction](https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction) on the project's GitHub Wiki.
### VS Code调试|VS Code Debug
### VS Code调试 | VS Code Debug
可以用VS Code打开此文件夹即可调试程序,可修改`.vscode`下的`launch.json`文件中的调试参数,调试说明参考:[https://zhuanlan.zhihu.com/p/41189402](https://zhuanlan.zhihu.com/p/41189402)。
You can use VS Code to open this folder and debug the program. You can modify the debugging parameters in the launch.json file located under the .vscode folder. For instructions on debugging with VSCode, you can refer to this guide: [Debugging Python with Visual Studio Code](https://code.visualstudio.com/docs/python/debugging).
## 打包说明|Package Instruction
## 打包说明 | Package Instruction
如果想要在主程序直接点击`本地直接运行`按钮即可执行程序,则需要打包程序为可执行程序。

27
ExecuteStage/constants.py Normal file
View File

@ -0,0 +1,27 @@
from enum import unique, IntEnum
@unique
class WriteMode(IntEnum):
    """Integer codes naming the output write modes: create, append, MySQL and JSON."""

    Create = 0 # Create mode
    Append = 1 # Append mode
    MySQL = 2 # MySQL mode
    Json = 3 # Json mode
@unique
class DataWriteMode(IntEnum):
    """Integer codes naming the data write modes: append, cover and rename."""

    Append = 1 # Append mode
    Cover = 2 # Cover (overwrite) mode
    Rename = 3 # Rename mode
@unique
class GraphOption(IntEnum):
    """Integer codes naming the operation kinds listed below.

    Note that no member is defined for the value 6.
    """

    Get = 1 # Open web page operation
    Click = 2 # Click operation
    Extract = 3 # Extract data operation
    Input = 4 # Input operation
    Custom = 5 # Custom operation
    Move = 7 # Move operation
    Loop = 8 # Loop operation

View File

@ -9,6 +9,7 @@ import threading
# import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from constants import WriteMode, DataWriteMode, GraphOption
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
@ -31,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from pynput.keyboard import Key, Listener
from datetime import datetime
import io # 遇到错误退出时应执行的代码
import json
@ -73,36 +73,24 @@ desired_capabilities["pageLoadStrategy"] = "none"
class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
def __init__(self, browser_t, id, service, version, event, saveName, config, option, commandline_config=""):
Thread.__init__(self)
self.logs = io.StringIO()
try:
self.log = bool(service["recordLog"])
except:
self.log = True
self.log = bool(service.get("recordLog", True))
self.browser = browser_t
self.option = option
self.config = config
self.commandline_config = commandline_config
self.version = version
self.totalSteps = 0
self.id = id
self.event = event
try:
self.saveName = service["saveName"] # 保存文件的名字
except:
now = datetime.now()
# 将时间格式化为精确到秒的字符串
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
now = datetime.now()
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
self.OUTPUT = ""
self.SAVED = False
self.BREAK = False
self.CONTINUE = False
try:
maximizeWindow = service["maximizeWindow"]
except:
maximizeWindow = 0
if maximizeWindow == 1:
self.browser.maximize_window()
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
# 名称设定
if saveName != "": # 命令行覆盖保存名称
self.saveName = saveName # 保存文件的名字
@ -120,16 +108,18 @@ class BrowserThread(Thread):
os.mkdir(self.downloadFolder + "/files")
if not os.path.exists(self.downloadFolder + "/images"):
os.mkdir(self.downloadFolder + "/images")
if not os.path.exists(self.downloadFolder + "/screenshots"):
os.mkdir(self.downloadFolder + "/screenshots")
self.getDataStep = 0
self.startSteps = 0
try:
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
if startFromExit == 1:
if service.get("startFromExit", 0) == 1:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
encoding='utf-8-sig') as file_obj:
self.startSteps = int(file_obj.read()) # 读取已执行步数
except:
pass
except Exception as e:
self.print_and_log(f"读取steps.txt失败原因{str(e)}")
if self.startSteps != 0:
self.print_and_log("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
self.startSteps, "条。")
@ -137,7 +127,7 @@ class BrowserThread(Thread):
"will start from the last step, before we already collected", self.startSteps, " items.")
else:
self.print_and_log("此模式下任务ID", self.id,
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
self.print_and_log("In this mode, task ID", self.id,
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
stealth_path = driver_path[:driver_path.find(
@ -145,13 +135,12 @@ class BrowserThread(Thread):
with open(stealth_path, 'r') as f:
js = f.read()
self.print_and_log("Loading stealth.min.js")
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
WebDriverWait(self.browser, 10)
@ -164,75 +153,65 @@ class BrowserThread(Thread):
self.monitor_thread.start()
# self.browser.get('about:blank')
self.procedure = service["graph"] # 程序执行流程
try:
self.maxViewLength = service["maxViewLength"] # 最大显示长度
except:
self.maxViewLength = 15
try:
self.outputFormat = service["outputFormat"] # 输出格式
except:
self.outputFormat = "csv"
try:
self.task_version = service["version"] # 任务版本
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
if service["version"] != version:
self.print_and_log("版本不一致,请使用" +
service["version"] + "版本的EasySpider运行该任务")
self.print_and_log("Version not match, please use EasySpider " +
service["version"] + " to run this task!")
self.browser.quit()
sys.exit()
except: # 0.2.0版本没有version字段所以直接退出
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式1为追加2为覆盖3为重命名文件
self.task_version = service.get("version", "") # 任务版本
if not self.task_version:
self.print_and_log("版本不一致请使用v0.2.0版本的EasySpider运行该任务")
self.print_and_log(
"Version not match, please use EasySpider v0.2.0 to run this task!")
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
self.browser.quit()
sys.exit()
try:
self.save_threshold = service["saveThreshold"] # 保存最低阈值
except:
self.save_threshold = 10
try:
self.links = list(
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
except:
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
pass
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务")
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
self.browser.quit()
sys.exit()
service_links = service.get("links")
if service_links:
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
else:
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
self.OUTPUT = [] # 采集的数据
try:
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式1为追加2为覆盖3为重命名文件
except:
self.dataWriteMode = 1
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
if self.dataWriteMode == 2:
if self.dataWriteMode == DataWriteMode.Cover.value:
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
elif self.dataWriteMode == 3:
elif self.dataWriteMode == DataWriteMode.Rename.value:
i = 2
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
i = i + 1
self.saveName = self.saveName + '_' + str(i)
self.print_and_log("文件已存在,已重命名为", self.saveName)
self.writeMode = 1 # 写入模式0为新建1为追加
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
self.writeMode = WriteMode.Create.value # 写入模式0为新建1为追加
if self.outputFormat in ['csv', 'txt', 'xlsx']:
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
self.OUTPUT.append([]) # 添加表头
self.writeMode = 0
self.writeMode = WriteMode.Create.value
elif self.outputFormat == "json":
self.writeMode = 3 # JSON模式无需判断是否存在文件
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
elif self.outputFormat == "mysql":
self.mysql = myMySQL(config["mysql_config_path"])
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
self.writeMode = 2
if self.writeMode == 0:
self.mysql.create_table(self.saveName, service["outputParameters"],
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
self.writeMode = WriteMode.MySQL.value # MySQL模式
if self.writeMode == WriteMode.Create.value:
self.print_and_log("新建模式|Create Mode")
elif self.writeMode == 1:
elif self.writeMode == WriteMode.Append.value:
self.print_and_log("追加模式|Append Mode")
elif self.writeMode == 2:
elif self.writeMode == WriteMode.MySQL.value:
self.print_and_log("MySQL模式|MySQL Mode")
elif self.writeMode == 3:
elif self.writeMode == WriteMode.Json.value:
self.print_and_log("JSON模式|JSON Mode")
self.containJudge = service["containJudge"] # 是否含有判断语句
self.outputParameters = {}
self.service = service
@ -245,191 +224,140 @@ class BrowserThread(Thread):
if param["name"] not in self.outputParameters.keys():
self.outputParameters[param["name"]] = ""
self.dataNotFoundKeys[param["name"]] = False
try:
self.outputParametersTypes.append(param["type"])
except:
self.outputParametersTypes.append("text")
try:
self.outputParametersRecord.append(
bool(param["recordASField"]))
except:
self.outputParametersRecord.append(True)
self.outputParametersTypes.append(param.get("type", "text"))
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
# 文件叠加的时候不添加表头
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if self.writeMode == 0:
self.OUTPUT[0].append(param["name"])
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
self.OUTPUT[0].append(param["name"])
self.urlId = 0 # 全局记录变量
self.preprocess() # 预处理,优化提取数据流程
try:
self.inputExcel = service["inputExcel"] # 输入Excel
except:
self.inputExcel = ""
self.inputExcel = service.get("inputExcel", "") # 输入Excel
self.readFromExcel() # 读取Excel获得参数值
# 检测如果没有复杂的操作,优化提取数据流程
def preprocess(self):
for node in self.procedure:
try:
iframe = node["parameters"]["iframe"]
except:
node["parameters"]["iframe"] = False
for index_node, node in enumerate(self.procedure):
parameters: dict = node["parameters"]
iframe = parameters.get('iframe')
option = node["option"]
try:
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
node["parameters"]["xpath"])
except:
pass
try:
node["parameters"]["waitElementIframeIndex"] = int(
node["parameters"]["waitElementIframeIndex"])
except:
node["parameters"]["waitElement"] = ""
node["parameters"]["waitElementTime"] = 10
node["parameters"]["waitElementIframeIndex"] = 0
if node["option"] == 1: # 打开网页操作
try:
cookies = node["parameters"]["cookies"]
except:
node["parameters"]["cookies"] = ""
elif node["option"] == 2: # 点击操作
try:
alertHandleType = node["parameters"]["alertHandleType"]
except:
node["parameters"]["alertHandleType"] = 0
if node["parameters"]["useLoop"]:
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
if parameters.get("xpath"):
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
if parameters.get("waitElementIframeIndex"):
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
else:
parameters["waitElement"] = ""
parameters["waitElementTime"] = 10
parameters["waitElementIframeIndex"] = 0
if option == GraphOption.Get.value: # 打开网页操作
parameters["cookies"] = parameters.get("cookies", "")
elif option == GraphOption.Click.value: # 点击操作
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
if parameters.get("useLoop"):
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 3: # 提取数据操作
node["parameters"]["recordASField"] = 0
try:
params = node["parameters"]["params"]
except:
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
params = node["parameters"]["params"]
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Extract.value: # 提取数据操作
parameters["recordASField"] = 0
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
parameters["clear"] = parameters.get("clear", 0)
parameters["newLine"] = parameters.get("newLine", 1)
params = parameters["params"]
for param in params:
try:
iframe = param["iframe"]
except:
param["iframe"] = False
try:
param["iframe"] = param.get("iframe", False)
if param.get("relativeXPath"):
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
except:
pass
try:
node["parameters"]["recordASField"] = param["recordASField"]
except:
node["parameters"]["recordASField"] = 1
try:
splitLine = int(param["splitLine"])
except:
param["splitLine"] = 0
if param["contentType"] == 8:
self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log(
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
parameters["recordASField"] = param.get("recordASField", 1)
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
if param.get("contentType") == 8:
self.print_and_log("默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType =="
"8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片"
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
"的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
"modify the source code get_content function -> contentType == 8 position "
"to your own OCR model and then compile and run it; or you can first set "
"the content type of the crawler to \"Element Screenshot\" to save the "
"picture, and then call your own program with custom operations. The "
"function of the program is to read the latest generated picture, then use "
"a good model, such as PaddleOCR to recognize the picture, and then return "
"the return value as a parameter output to the program.")
param["optimizable"] = detect_optimizable(param)
elif node["option"] == 4: # 输入文字
try:
index = node["parameters"]["index"] # 索引值
except:
node["parameters"]["index"] = 0
elif node["option"] == 5: # 自定义操作
try:
clear = node["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
newLine = node["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
elif node["option"] == 7: # 移动到元素
if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 8: # 循环操作
try:
exitElement = node["parameters"]["exitElement"]
if exitElement == "":
node["parameters"]["exitElement"] = "//body"
except:
node["parameters"]["exitElement"] = "//body"
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
try:
skipCount = node["parameters"]["skipCount"]
except:
node["parameters"]["skipCount"] = 0
elif option == GraphOption.Input.value: # 输入文字
parameters['index'] = parameters.get('index', 0)
elif option == GraphOption.Custom.value: # 自定义操作
parameters['clear'] = parameters.get('clear', 0)
parameters['newLine'] = parameters.get('newLine', 1)
elif option == GraphOption.Move.value: # 移动到元素
if parameters.get('useLoop'):
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
parameters["xpath"] = ""
self.print_and_log(f"您的任务版本号为{self.task_version}循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif option == GraphOption.Loop.value: # 循环操作
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
parameters["quickExtractable"] = False # 是否可以快速提取
parameters['skipCount'] = parameters.get('skipCount', 0)
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
try:
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
except:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
try:
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
except:
waitElement = ""
if node["parameters"]["iframe"]:
node["parameters"]["quickExtractable"] = False # 如果是iframe那么不可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
if not params:
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
if parameters["iframe"]:
parameters["quickExtractable"] = False # 如果是iframe那么不可以快速提取
else:
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
if node["parameters"]["skipCount"] > 0:
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
parameters["quickExtractable"] = True # 先假设可以快速提取
if parameters["skipCount"] > 0:
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
for param in params:
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
try:
iframe = param["iframe"]
except:
param["iframe"] = False
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
param['iframe'] = param.get('iframe', False)
if param["iframe"] and not param["relative"]: # 如果是iframe那么不可以快速提取
optimizable = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
node["parameters"]["quickExtractable"] = False
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
parameters["quickExtractable"] = False
break
if node["parameters"]["quickExtractable"]:
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
try:
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
except:
node["parameters"]["clear"] = 0
try:
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
except:
node["parameters"]["newLine"] = 1
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
if parameters["quickExtractable"]:
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
node["parameters"]["quickParams"] = []
for param in params:
content_type = ""
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
"::text()") >= 0:
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
or param["relativeXPath"].find("::text()") >= 0:
content_type = ""
elif param["nodeType"] == 2:
content_type = "//@href"
elif param["nodeType"] == 4: # 图片链接
elif param["nodeType"] == 4: # 图片链接
content_type = "//@src"
elif param["contentType"] == 1:
content_type = "/text()"
elif param["contentType"] == 0:
content_type = "//text()"
if param["relative"]: # 如果是相对XPath
if param["relative"]: # 如果是相对XPath
xpath = "." + param["relativeXPath"] + content_type
else:
xpath = param["relativeXPath"] + content_type
@ -443,6 +371,7 @@ class BrowserThread(Thread):
"nodeType": param["nodeType"],
"default": param["default"],
})
self.procedure[index_node]["parameters"] = parameters
self.print_and_log("预处理完成|Preprocess completed")
def readFromExcel(self):
@ -559,7 +488,10 @@ class BrowserThread(Thread):
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
time.sleep(quitWaitTime)
self.browser.quit()
try:
self.browser.quit()
except:
pass
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
try:
shutil.rmtree(self.option["tmp_user_data_folder"])
@ -775,18 +707,20 @@ class BrowserThread(Thread):
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 2:
self.recordLog("Execute JavaScript for element:" + code)
self.recordLog("对元素执行JavaScript:" + code)
self.browser.set_script_timeout(max_wait_time)
try:
output = self.browser.execute_script(code, element)
except:
except Exception as e:
output = ""
self.recordLog("JavaScript execution failed")
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
elif int(codeMode) == 5:
try:
code = readCode(code)
@ -796,9 +730,9 @@ class BrowserThread(Thread):
self.recordLog("执行下面的代码:" + code)
self.recordLog("Execute the following code:" + code)
except Exception as e:
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
self.print_and_log("Error executing the following code:" +
code, ", error is:", e)
code, ", error is:", str(e))
elif int(codeMode) == 6:
try:
code = readCode(code)
@ -1204,7 +1138,7 @@ class BrowserThread(Thread):
return index, element
# 对循环的处理
def loopExecute(self, node, loopValue, clickPath="", index=0):
def loopExecute(self, node, loopValue, loopPath="", index=0):
time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
try:
@ -1216,6 +1150,14 @@ class BrowserThread(Thread):
self.history["handle"] = thisHandle
thisHistoryURL = self.browser.current_url
# 快速提取处理
# start = time.time()
try:
tree = html.fromstring(self.browser.page_source)
except Exception as e:
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
node["parameters"]["quickExtractable"] = False
# end = time.time()
# print("解析页面秒数:", end - start)
if node["parameters"]["quickExtractable"]:
self.browser.switch_to.default_content() # 切换到主页面
tree = html.fromstring(self.browser.page_source)
@ -1721,8 +1663,11 @@ class BrowserThread(Thread):
try:
actions = ActionChains(self.browser) # 实例化一个action对象
if newTab == 1: # 在新标签页打开
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
if sys.platform == "darwin": # Mac
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
else:
# Ctrl + Click
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
else:
actions.click(element).perform()
except Exception as e:
@ -1925,9 +1870,17 @@ class BrowserThread(Thread):
width = size["width"]
height = size["height"]
# 调整浏览器窗口的大小
self.browser.set_window_size(width, height)
if self.commandline_config["headless"] == 1: # 无头模式下,截取整个网页的高度
page_width = self.browser.execute_script(
"return document.body.scrollWidth")
page_height = self.browser.execute_script(
"return document.body.scrollHeight")
self.browser.set_window_size(page_width, page_height)
time.sleep(1)
else:
self.browser.set_window_size(width, height)
element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
"/" + str(time.time()) + ".png")
"/screenshots/" + str(time.time()) + ".png")
# 截图完成后,将浏览器的窗口大小设置为原来的大小
self.browser.set_window_size(width, height)
elif p["contentType"] == 8:
@ -2238,7 +2191,7 @@ class BrowserThread(Thread):
if __name__ == '__main__':
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {
commandline_config = {
"ids": [0],
"saved_file_name": "",
"user_data": False,
@ -2249,9 +2202,11 @@ if __name__ == '__main__':
"server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入
"pause_key": "p", # 暂停键
"version": "0.6.2",
"version": "0.6.3",
"docker_driver": "",
"user_folder": "",
}
c = Config(config)
c = Config(commandline_config)
print(c)
options = webdriver.ChromeOptions()
driver_path = "chromedriver.exe"
@ -2345,35 +2300,43 @@ if __name__ == '__main__':
os.mkdir(tmp_user_folder_parent)
characters = string.ascii_letters + string.digits
for i in range(len(c.ids)):
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options = tmp_options[i]["options"]
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
if c.user_folder == "":
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
try:
shutil.rmtree(tmp_user_data_folder)
except:
pass
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
if os.path.exists(absolute_user_data_folder):
try:
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
except:
tmp_user_data_folder = absolute_user_data_folder
print("Copy user data folder failed, use the original folder.")
print("复制用户信息目录失败,使用原始目录。")
else:
tmp_user_data_folder = absolute_user_data_folder
print("Cannot find user data folder, create a new folder.")
print("未找到用户信息目录,创建新目录。")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
print(f"Use local user data folder: {tmp_user_data_folder}")
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
else:
options.add_argument(
f'--user-data-dir={c.user_folder}')
print(f"Use specifed user data folder: {c.user_folder}, please note if you are using docker, this user folder path should be the path inside the docker container.")
print(f"使用指定的用户信息目录: {c.user_folder}请注意如果您正在使用docker此用户文件夹路径应是容器内的路径。")
print(
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally说明有之前运行的Chrome实例没有正常关闭请关闭之前打开的所有Chrome实例后再运行程序即可。")
print(
@ -2386,9 +2349,13 @@ if __name__ == '__main__':
print("id: ", id)
if c.read_type == "remote":
print("remote")
content = requests.get(
try:
content = requests.get(
c.server_address + "/queryExecutionInstance?id=" + str(id))
service = json.loads(content.text) # 加载服务信息
service = json.loads(content.text) # 加载服务信息
except:
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
print("无法连接到服务器请确保EasySpider主程序正在运行或者您可以将--read_type参数更改为'local'以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
else:
print("local")
local_folder = os.path.join(os.getcwd(), "execution_instances")
@ -2439,8 +2406,17 @@ if __name__ == '__main__':
except:
browser = "chrome"
if browser == "chrome":
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
if c.docker_driver == "":
print("Using local driver")
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
else:
print("Using remote driver")
# Use docker driver, default address is http://localhost:4444/wd/hub
# Headless mode
# options.add_argument("--headless")
# print("Headless mode")
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
elif browser == "edge":
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
@ -2472,7 +2448,7 @@ if __name__ == '__main__':
event = Event()
event.set()
thread = BrowserThread(browser_t, id, service,
c.version, event, c.saved_file_name, config=config, option=tmp_options[i])
c.version, event, c.saved_file_name, config=config, option=tmp_options[i], commandline_config=c)
print("Thread with task id: ", id, " is created")
threads.append(thread)
thread.start()
@ -2501,6 +2477,7 @@ if __name__ == '__main__':
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入
try:
from pynput.keyboard import Key, Listener
if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event),
on_release=on_release_creator(event, press_time)) as listener:

108
ExecuteStage/fl_beta.py Normal file
View File

@ -0,0 +1,108 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
from PIL import Image
import os
# ResNet model definition (ResNet18 is used as the example backbone)
class ResNetModel(nn.Module):
    """ResNet-18 whose final fully connected layer is resized to ``num_classes`` outputs.

    Args:
        num_classes: number of output units of the replacement ``fc`` layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.resnet18(pretrained=True)
        # Swap the final fully connected layer so the output size matches the
        # target classification task.
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Linear(feature_dim, num_classes)
        self.resnet = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped ResNet and return its output."""
        logits = self.resnet(x)
        return logits
# Custom dataset class
class WebpageDataset(Dataset):
    """Dataset over the ``.png`` files found directly inside ``image_dir``.

    The label of a sample is parsed from its file name: the text before the
    first underscore is converted with ``int()`` (e.g. ``"3_home.png"`` -> 3).

    Args:
        image_dir: directory that is listed once, at construction time.
        transform: optional callable applied to the RGB image in
            ``__getitem__``; skipped when falsy.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sort the names so
        # that index -> sample is stable across runs and machines.
        self.image_files = sorted(
            f for f in os.listdir(image_dir) if f.endswith('.png')
        )

    def __len__(self):
        """Return the number of ``.png`` files discovered in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the ``idx``-th file name."""
        file_name = self.image_files[idx]
        img_name = os.path.join(self.image_dir, file_name)
        image = Image.open(img_name).convert('RGB')
        label = self.get_label_from_filename(file_name)
        if self.transform:
            image = self.transform(image)
        return image, label

    def get_label_from_filename(self, filename):
        """Return the integer label encoded before the first ``_`` in ``filename``.

        Raises:
            ValueError: if that prefix is not a valid integer literal.
        """
        # File names are assumed to look like 'class_label.png'.
        return int(filename.split('_')[0])
# Image preprocessing: resize to 224x224, convert to a tensor, then normalise
# each channel (presumably the usual ImageNet statistics - confirm).
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
# Client-side training function
def train_local_model(model, dataloader, criterion, optimizer, epochs=5):
    """Train ``model`` on one client's data and return its state dict.

    Args:
        model: module to train; it is switched to training mode first.
        dataloader: iterable yielding ``(images, labels)`` batches.
        criterion: callable computing the loss from ``(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        epochs: number of full passes over ``dataloader``.

    Returns:
        ``model.state_dict()`` after training.
    """
    model.train()
    for _ in range(epochs):
        for batch_inputs, batch_targets in dataloader:
            # Clear gradients left over from the previous step before the
            # new backward pass accumulates into them.
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()
    return model.state_dict()
# Federated averaging (FedAvg with equal client weights)
def federated_average(models_state_dicts):
    """Return the element-wise mean of several model state dicts.

    The inputs are left untouched: every averaged tensor is a new tensor and
    the result is a new mapping, so the first client's state dict (and the
    parameters it references) is not modified.

    Args:
        models_state_dicts: non-empty sequence of state dicts that share the
            same keys, with tensors of matching shape under each key.

    Returns:
        A shallow copy of the first mapping in which every value is replaced
        by ``torch.div(sum_of_values, number_of_state_dicts)``. As with any
        true division, integer tensors come back as floating point.

    Raises:
        ValueError: if ``models_state_dicts`` is empty.
    """
    import copy

    if not models_state_dicts:
        raise ValueError("federated_average() requires at least one state dict")

    num_models = len(models_state_dicts)
    reference = models_state_dicts[0]
    # A shallow copy keeps the mapping type of the first state dict; every
    # value is overwritten below, so no tensor is shared with the inputs.
    avg_state_dict = copy.copy(reference)
    for key in reference.keys():
        # clone() so the in-place accumulation never writes into a tensor
        # owned by one of the callers' models.
        total = reference[key].clone()
        for state_dict in models_state_dicts[1:]:
            total += state_dict[key]
        avg_state_dict[key] = torch.div(total, num_models)
    return avg_state_dict
# Simulated data for several clients
client_data_dirs = ['client1_data', 'client2_data', 'client3_data']  # one data directory per client
num_classes = 10  # set according to the actual task

# Initialise the global model
global_model = ResNetModel(num_classes=num_classes)

# Loss function
criterion = nn.CrossEntropyLoss()

# Federated learning loop
num_rounds = 10
# The loop variable is deliberately not called `round`, which would shadow
# the built-in round() for the rest of the module.
for round_idx in range(num_rounds):
    local_models = []
    for client_dir in client_data_dirs:
        # Load this client's data
        dataset = WebpageDataset(image_dir=client_dir, transform=transform)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

        # Start the client model from the current global weights
        local_model = ResNetModel(num_classes=num_classes)
        local_model.load_state_dict(global_model.state_dict())

        # Optimizer for this client's local training
        optimizer = optim.SGD(local_model.parameters(), lr=0.01, momentum=0.9)

        # Train the local model and keep its resulting weights
        local_state_dict = train_local_model(local_model, dataloader, criterion, optimizer)
        local_models.append(local_state_dict)

    # Aggregate the client weights into the global model
    global_state_dict = federated_average(local_models)
    global_model.load_state_dict(global_state_dict)
    print(f'Round {round_idx + 1}/{num_rounds} completed.')

# Save the global model
torch.save(global_model.state_dict(), 'federated_resnet_model.pth')

View File

@ -1,5 +1,10 @@
#!/bin/bash
rm -r build
rm -r dist
# 一定要先source一下不然会出现找不到conda命令的错误
source ~/miniconda3/etc/profile.d/conda.sh
conda activate easyspider
# Python一定要是3.11版本不然会出现浏览器弹出崩溃的错误原来使用的3.8,崩溃原因未知。
pyinstaller -F --add-data "/home/naibo/miniconda3/envs/easyspider/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.cpython-311-x86_64-linux-gnu.so:onnxruntime/capi" --add-data "/home/naibo/miniconda3/envs/easyspider/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
rm ../ElectronJS/chrome_linux64/easyspider_executestage

36
ExecuteStage/llm_beta.py Normal file
View File

@ -0,0 +1,36 @@
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
# Load the Llama 3.2 vision model and its processor.
# These are top-level statements, so both from_pretrained() calls run as soon
# as this module is imported or executed.
model_name = "meta-llama/Llama-3.2-11B-Vision"  # replace with the actual model path as needed
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name)
# Process a webpage screenshot and extract its structure
def predict_structure_from_image(image_path):
    """Generate a text description for the screenshot stored at ``image_path``.

    The image is opened, converted to RGB, run through the module-level
    ``processor`` and passed to the module-level ``model`` for beam-search
    generation; the first generated sequence is decoded and returned.

    Args:
        image_path: path of the image file to describe.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens skipped.
    """
    # NOTE(review): only the pixel values are handed to generate(); confirm
    # that this checkpoint supports generation without a text prompt.
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = processor(images=rgb_image, return_tensors="pt")

    generation_options = {
        "max_length": 512,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated = model.generate(model_inputs["pixel_values"], **generation_options)

    return processor.decode(generated[0], skip_special_tokens=True)
# Example usage: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Path of the webpage screenshot to analyse
    image_path = "webpage_screenshot.png"  # replace with the actual image file path
    # Predict the structure and print the decoded description
    predicted_structure = predict_structure_from_image(image_path)
    print("预测的结构:", predicted_structure)

View File

@ -19,11 +19,16 @@ desired_capabilities["pageLoadStrategy"] = "none"
class MyChrome(webdriver.Chrome):
class MyChrome(webdriver.Chrome, webdriver.Remote):
def __init__(self, *args, **kwargs):
def __init__(self, mode='local_driver', *args, **kwargs):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
self.mode = mode
if mode == "local_driver":
webdriver.Chrome.__init__(self, *args, **kwargs)
elif mode == "remote_driver":
webdriver.Remote.__init__(self, *args, **kwargs)
# super().__init__(*args, **kwargs) # 调用父类的 __init__
# def find_element(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为

View File

@ -1,14 +1,14 @@
commandline_config==2.2.3
requests==2.31.0
selenium==4.16.0
requests==2.32.3
selenium==4.27.1
pyinstaller==5.13.2
Pillow==10.2.0
xlsxwriter==3.1.9
xlsxwriter==3.2.0
openpyxl==3.1.2
pymysql==1.1.0
lxml==4.9.2
ddddocr==1.4.10
pymysql==1.1.1
lxml==5.3.0
ddddocr==1.5.6
pynput==1.7.6
beautifulsoup4==4.12.2
undetected-chromedriver==3.4.7
pandas==2.1.4
pandas==2.2.3

View File

@ -1,11 +1,11 @@
commandline_config==2.2.3
requests==2.31.0
requests==2.32.0
selenium==4.16.0
pyinstaller==5.13.2
Pillow==9.5.0
xlsxwriter==3.1.9
openpyxl==3.1.2
pymysql==1.1.0
pymysql==1.1.1
lxml==4.9.2
ddddocr==1.4.10
pynput==1.7.6

View File

@ -1,4 +1,8 @@
# 环境编译说明|Environment Compilation Instruction
## 视频教程
[从源代码编译程序并设计运行和调试任务指南基于Ubuntu24.04](https://www.bilibili.com/video/BV1VE421P7yj/)
# 环境编译说明 | Environment Compilation Instruction
EasySpider分三部分
@ -18,20 +22,20 @@ EasySpider is divided into three parts:
This section covers the compilation instructions for the `Browser extension`, **all commands in this section are executed in the `manifest_v3` folder**, i.e., you need to `cd manifest_v3` first.
## 建议编译顺序|Suggested Compilation Order
## 建议编译顺序 | Suggested Compilation Order
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
3. 编译执行阶段程序,否则无法执行程序,只能设计程序
3. 编译执行阶段程序,否则无法执行任务,只能设计任务
-----
1. Compile the browser extension, otherwise an error will be prompted when the main program is executed that `EasySpider_en.crx` cannot be found.
2. Compile the main program. At this point the main program can run normally, but it can only design tasks, not execute them.
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
3. Compile the execution stage program; otherwise tasks cannot be executed, only designed.
## 环境构建|Environment Setup
## 环境构建 | Environment Setup
1. 安装`NodeJS`[https://nodejs.org/zh-cn/download/](https://nodejs.org/zh-cn/download/)。
2. 运行下面的命令来安装依赖:
@ -49,7 +53,7 @@ npm install
npm install
```
## 热加载扩展|Hot reload the extension
## 热加载扩展 | Hot reload the extension
执行下面的命令来热加载扩展:
@ -69,7 +73,7 @@ npm run dev
Open a Chrome browser window, then enter `chrome://extensions/` in the browser address bar. On the opened page, open the `Developer mode` in the upper right corner, click `Load unpacked` and select the `manifest_v3/dist` folder to load the extension.
## 打包扩展|Package the extension
## 打包扩展 | Package the extension
执行下面的命令来打包扩展:

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
{
"name": "EasySpider",
"version": "0.6.2",
"version": "0.6.3",
"type": "module",
"scripts": {
"build": "rollup -c",
@ -34,7 +34,7 @@
"@types/node": "^16.11.10",
"@vitejs/plugin-vue": "^1.9.3",
"esno": "^0.12.1",
"firebase": "^9.18.0",
"firebase": "^10.12.2",
"fs-extra": "^10.0.0",
"npm-run-all": "^4.1.5",
"rimraf": "^3.0.2",

View File

@ -1,5 +1,6 @@
import config from './config.json';
export var global = {
nodeList: [], //已被选中的节点列表
readyList: [], //预备选中的list
@ -57,13 +58,13 @@ export function getElementXPaths(element, parentElement = document.body) {
paths.push(pre_xpath + `id("${element.id}")`);
}
if (element.className) {
paths.push(pre_xpath + "//" + element.tagName + "[@class='" + element.className + "']");
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@class='" + element.className + "']");
}
if (element.name) {
paths.push(pre_xpath + "//" + element.tagName + "[@name='" + element.name + "']");
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@name='" + element.name + "']");
}
if (element.alt) {
paths.push(pre_xpath + "//" + element.tagName + "[@alt='" + element.alt + "']");
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@alt='" + element.alt + "']");
}
paths.push(getAbsoluteXPathWithReverseIndex(element));
console.log("ALL PATHS: " + paths);

View File

@ -1,9 +1,28 @@
import $ from "jquery";
import Vue from "vue";
import {global, getOS, readXPath, addEl, clearEl, clearReady, handleElement, clearParameters, generateParameters, generateMultiParameters, handleDescendents, generateValTable, findRelated, pushToReadyList, readyToList, combineXpath, relatedTest} from "./global.js";
import {
global,
getOS,
readXPath,
addEl,
clearEl,
clearReady,
handleElement,
clearParameters,
generateParameters,
generateMultiParameters,
handleDescendents,
generateValTable,
findRelated,
pushToReadyList,
readyToList,
combineXpath,
relatedTest,
LANG
} from "./global.js";
import ToolKit from "./toolkit.vue";
import iframe from "./iframe.vue";
import {createNotification} from './trail.js';
//表现逻辑层的处理
@ -170,7 +189,7 @@ window.addEventListener('DOMContentLoaded', () => {
document.onkeydown = function(event) {
// console.log("keydown");
var e = event || window.event || arguments.callee.caller.arguments[0];
if (e && e.keyCode == 118) { // 按 F7
if (e && e.keyCode == 113) { // 按 F2
addEl();
} else if (e && e.keyCode == 119) { //按F8
clearEl();
@ -316,11 +335,16 @@ function generateToolkit() {
//Vue元素
generateToolkit();
/**
 * Hide the toolkit element and show a notification explaining how to
 * bring it back from the browser's extension area.
 */
function closeToolkit() {
    toolkit.style.display = "none"; // hide the element
    const reopenHint = LANG(
        "EasySpider操作控制台已隐藏可点击浏览器右上角扩展程序区域的EasySpider图标重新显示。",
        "EasySpider Toolkit is hidden. Click the EasySpider icon in the extension list (upper right corner) of the browser to reopen."
    );
    createNotification(reopenHint);
}
let closeButton = document.getElementById("closeButton");
closeButton.addEventListener("click", function() {
toolkit.style.display = "none"; // 隐藏元素
closeToolkit();
});
let closeButtonLeft = document.getElementById("closeButtonLeft");
closeButtonLeft.addEventListener("click", function() {
toolkit.style.display = "none"; // 隐藏元素
closeToolkit();
});

View File

@ -27,6 +27,10 @@ global.ws.onmessage = function (evt) {
clearEl();
} else if (evt["type"] == "trial") {
trial(evt);
} else if (evt["type"] == "showAllToolboxes") {
document.getElementById("wrapperToolkit").style.display = "block";
} else if (evt["type"] == "hideAllToolboxes") {
document.getElementById("wrapperToolkit").style.display = "none";
}
};

View File

@ -24,7 +24,7 @@
</div>
<p style="color:black; margin-top: 10px"> 鼠标移动到笑脸<span style="font-size: 20px"></span>查看操作提示</p>
<p style="color:black; margin-top: 10px">
鼠标移动到元素上后<strong>右键</strong>点击或者按<strong>F7</strong>键选中页面元素
鼠标移动到元素上后<strong>右键</strong>点击或者按<strong>F2</strong>键选中页面元素
</p>
<p style="color:black; margin-top: 10px">
通过鼠标左键进行点击时，页面也会有反应，但左键点击发生的操作不会被记录在任务流程中；同理，如果想输入文本框但并不想将动作记录，可以鼠标移动到文本框并按键盘的<strong>F9</strong>进行输入
@ -231,7 +231,7 @@
<p style="color:black; margin-top: 10px"> Mouse move to smiling face <span style="font-size: 20px"></span> to see operation help.</p>
<p style="color:black; margin-top: 10px"> When your mouse moves to the element, please
<strong>right-click</strong> your
mouse button or press <strong>F7</strong> on the keyboard to select it.</p>
mouse button or press <strong>F2</strong> on the keyboard to select it.</p>
<p style="color:black; margin-top: 10px"> When clicked with the left mouse button, the page will also
respond, but this click operation will not be recorded in the task flow. Similarly, if you want to input
in a text box but do not want the action to be recorded , you can move the mouse to the text box and

View File

@ -1,6 +1,6 @@
{
"name": "EasySpider",
"version": "0.6.2",
"version": "0.6.3",
"description": "EasySpider's chrome extension",
"author": "Naibo Wang",
"manifest_version": 3,
@ -11,6 +11,7 @@
"38": "assets/icon-38.png",
"128": "assets/icon-128.png"
},
"default_popup": "popup.html",
"default_title": "EasySpider"
},
"icons": {
@ -53,6 +54,7 @@
"storage",
"tabs",
"scripting",
"activeTab",
"notifications"
]
}

View File

@ -1,11 +1,19 @@
<!doctype html>
<!DOCTYPE html>
<html>
<head>
<title>Popup 示例</title>
<link rel="stylesheet" type="text/css" href="popup.css">
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>EasySpider Control Panel</title>
<link rel="stylesheet" href="style/bootstrap.min.css">
</head>
<body>
<!-- <h2>EasySpider Extension</h2>-->
EasySpider Extension, please do not disable me.
<body class="p-4">
<div class="text-center">
<!-- <h3>操作</h3>-->
<p id="title">可执行操作</p>
<button id="show-toolkit" class="btn btn-primary" style="width: 200px">显示EasySpider操作台</button>
<p></p>
<button id="close-toolkit" class="btn btn-danger" style="width: 200px">隐藏EasySpider操作台</button>
</div>
<script src="popup.js"></script>
</body>
</html>

View File

@ -1,3 +1,141 @@
document.getElementById('clickme').addEventListener('click', () => {
alert('Hello, World!');
import config from './content-scripts/config.json';
import {global} from "./content-scripts/global.js";
// Localise the popup's static labels: Chinese when the configured language is
// 'zh', English otherwise. Keys are the ids of the elements to relabel.
const popupLabels = config.language == 'zh'
    ? {
        'title': '可执行操作',
        'show-toolkit': '显示EasySpider操作台',
        'close-toolkit': '隐藏EasySpider操作台',
    }
    : {
        'title': 'Executable Operations',
        'show-toolkit': 'Show EasySpider Toolkit',
        'close-toolkit': 'Hide EasySpider Toolkit',
    };
for (const [elementId, labelText] of Object.entries(popupLabels)) {
    document.getElementById(elementId).innerText = labelText;
}
// Connect to the local WebSocket server on port 8084 and register this page
// with it as soon as the connection opens.
const ws = new WebSocket("ws://localhost:8084");
ws.onopen = () => {
    console.log("已连接");
    // Handshake: message type 0 is the "connect" operation; id 3 is this
    // socket's id, and the page title is sent along with it.
    const handshake = {
        type: 0,
        message: {
            id: 3,
            title: document.title,
        },
    };
    ws.send(JSON.stringify(handshake));
};
/**
 * Runs one popup action: injects `injectedFunc` into the active tab, sends the
 * matching message over the WebSocket, and then closes the popup.
 *
 * The popup is closed only after the WebSocket frame has been queued and the
 * injection promise has settled. Closing first (as the previous code did)
 * tears the page down while the frame may still be buffered and leaves any
 * injection failure as an unhandled rejection.
 *
 * @param {Function} injectedFunc - Self-contained function executed in the tab.
 * @param {number} messageType - 30 = show all toolboxes, 31 = hide all toolboxes.
 * @param {string} errorLabel - Prefix used when logging a failure.
 * @returns {Promise<void>}
 */
async function dispatchToolkitAction(injectedFunc, messageType, errorLabel) {
    try {
        const [tab] = await chrome.tabs.query({active: true, currentWindow: true});
        // Start the injection right away; it is awaited below, before closing.
        const injection = chrome.scripting.executeScript({
            target: {tabId: tab.id},
            func: injectedFunc
        });
        const message_action = {
            type: messageType,
            from: 3, // 3 = popup
            message: {}
        };
        try {
            ws.send(JSON.stringify(message_action));
        } catch (e) {
            // Best effort: the socket may still be connecting or already closed.
            // The in-tab action still runs, so report the problem and carry on.
            console.warn('Failed to send toolkit message over WebSocket:', e);
        }
        try {
            await injection;
        } catch (e) {
            // e.g. pages where script injection is not permitted.
            console.error(errorLabel, e);
        }
        window.close();
    } catch (error) {
        console.error(errorLabel, error);
    }
}

// "Show" button: reveal the toolkit in the active tab and send type 30.
document.getElementById('show-toolkit').addEventListener('click', () => {
    return dispatchToolkitAction(showToolkit, 30, 'Error showing toolkit:');
});

// "Hide" button: hide the toolkit in the active tab and send type 31.
document.getElementById('close-toolkit').addEventListener('click', () => {
    return dispatchToolkitAction(closeToolkit, 31, 'Error closing toolkit:');
});
/**
 * Shows every hidden `#wrapperToolkit` element in the current document and in
 * all reachable nested iframes.
 *
 * This function is passed as `func` to `chrome.scripting.executeScript`, so it
 * keeps its helper nested inside instead of relying on outer-scope names.
 */
function showToolkit() {
    const TOOLKIT_SELECTOR = '#wrapperToolkit';

    // Reveal the wrappers of one document, then descend into its iframes.
    const revealIn = (root) => {
        for (const wrapper of root.querySelectorAll(TOOLKIT_SELECTOR)) {
            if (getComputedStyle(wrapper).display !== 'none') {
                continue; // already visible, leave it untouched
            }
            wrapper.style.display = 'block';
            console.log('显示EasySpider操作台');
        }
        for (const frame of root.querySelectorAll('iframe')) {
            try {
                const frameDoc = frame.contentDocument || frame.contentWindow.document;
                if (frameDoc) {
                    revealIn(frameDoc);
                }
            } catch (err) {
                // Accessing the frame's document failed; skip this frame.
                console.warn('无法访问 iframe:', err);
            }
        }
    };

    revealIn(document);
}
/**
 * Hides every visible `#wrapperToolkit` element in the current document and in
 * all reachable nested iframes.
 *
 * A wrapper is hidden whenever its computed display is anything other than
 * 'none'. The previous check (`=== 'block'`) silently skipped wrappers rendered
 * with another display value such as 'flex' or 'inline-block'.
 *
 * This function is passed as `func` to `chrome.scripting.executeScript`, so it
 * keeps its helpers nested inside instead of relying on outer-scope names.
 */
function closeToolkit() {
    // Hide all toolkit wrappers found directly in one document.
    const hideContainers = (documentRoot) => {
        const containers = documentRoot.querySelectorAll('#wrapperToolkit');
        containers.forEach(container => {
            if (getComputedStyle(container).display !== 'none') {
                container.style.display = 'none';
                console.log('关闭EasySpider操作台');
            }
        });
    };
    // Recurse through the iframes of one document.
    const processIframes = (documentRoot) => {
        const iframes = documentRoot.querySelectorAll('iframe');
        console.log("iframes", iframes);
        iframes.forEach(iframe => {
            try {
                const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
                if (iframeDoc) {
                    // Hide the #wrapperToolkit elements inside this iframe, then go deeper.
                    hideContainers(iframeDoc);
                    processIframes(iframeDoc);
                }
            } catch (err) {
                // Accessing the frame's document failed; skip this frame.
                console.warn('无法访问 iframe:', err);
            }
        });
    };
    // Handle the main document first, then any nested iframes.
    hideContainers(document);
    processIframes(document);
}

File diff suppressed because one or more lines are too long

1098
LICENSE

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More