mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 03:59:57 +08:00
gooood
This commit is contained in:
parent e9c65585cb
commit bde9b2fe55
13  gooood/README.md  Normal file
@@ -0,0 +1,13 @@
# Case study

The crawling part of this site is simple; the cleaning logic is a bit more involved, because the article body is delivered as backend-generated HTML.

**Project structure**

1. docs is the directory where results are stored
2. spider is the directory with the crawler scripts
3. extrator is the directory with the parsing scripts

(A hypothetical end-to-end driver for these three stages is sketched right after this README.)

**Known issues**

1. This is only a simple example; the project has no test cases, so long-term correct operation of the code cannot be guaranteed
2. The logic for the second block of project information is fairly fiddly; given limited time it is left unhandled for now
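For orientation only: the three stages are meant to run in order (list spider, then detail spider, then the cleaner). Below is a minimal, hypothetical driver that is not part of this commit. It assumes it lives in the gooood/ project root next to docs/, spider/ and extrator/, that each stage runs with its own directory as the working directory (so the relative ../docs paths in File resolve), and that extrator/ contains the setting.py that base.py imports, which is not shown in this diff.

# Hypothetical pipeline driver (not in this commit); place it in the gooood/ root.
import os
import subprocess
import sys

ROOT = os.path.dirname(os.path.abspath(__file__))
# Put the project root on PYTHONPATH so 'from dao.file_dao import File' resolves.
env = {**os.environ, "PYTHONPATH": ROOT}

STAGES = [
    ("spider", "gooood_list.py"),      # writes ../docs/raw_gd_list.csv
    ("spider", "gooood_detail.py"),    # writes ../docs/raw_gd_detail.csv
    ("extrator", "clean_detail.py"),   # writes ../docs/结果表.json
]

for cwd, script in STAGES:
    print(f"running {cwd}/{script}")
    subprocess.run([sys.executable, script],
                   cwd=os.path.join(ROOT, cwd), env=env, check=True)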
0  gooood/dao/__init__.py  Normal file
34  gooood/dao/file_dao.py  Normal file
@@ -0,0 +1,34 @@
import pandas as pd
import json


class File(object):

    def __init__(self):
        pass

    def write_json(self, filename, data_list):
        with open(f'../docs/{filename}.json', 'w') as f:
            for item in data_list:
                json.dump(item, f)
                f.write("\n")

    def write(self, filename, data_list, columns_list):
        filename = f'../docs/{filename}.csv'
        try:
            frame = pd.DataFrame(
                data_list,
                columns=columns_list
            )
            frame.to_csv(filename)
        except Exception as e:
            raise e
        return True

    def read(self, filename):
        try:
            filename = f'../docs/{filename}.csv'
            frame = pd.read_csv(filename)
        except Exception as e:
            raise e
        return frame
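A small, hypothetical usage sketch of File, not part of this commit. It assumes it is run from spider/ or extrator/ so the relative ../docs path resolves, and that the project root is on sys.path, the same assumption the spider scripts make; the filename "demo" is made up for illustration.

# Hypothetical File usage (run from spider/ or extrator/, project root on sys.path).
from dao.file_dao import File

file = File()
file.write("demo", [["hello"], ["world"]], ["text"])   # -> ../docs/demo.csv
frame = file.read("demo")                               # pandas DataFrame
print(frame["text"].tolist())                           # ['hello', 'world']
file.write_json("demo", [{"text": t} for t in frame["text"]])  # one JSON object per line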
60  gooood/docs/结果表.json  Normal file
File diff suppressed because one or more lines are too long
0  gooood/extrator/__init__.py  Normal file
64  gooood/extrator/base.py  Normal file
@@ -0,0 +1,64 @@
from setting import PLACEHOLDER
from w3lib.html import remove_tags


class BaseExtractor(object):

    def parse_paragraph(self, selector, content_list, tag='p'):
        """
        arguments:
            selector: scrapy.Selector.
            content_list: article content list.
            tag: paragraph tag.
        """
        text = ''.join(selector.css(tag).xpath(
            'node()').extract()).strip()
        if text:
            text = remove_tags(
                text, which_ones=('a', 'meta', 'script', 'span'))
            content_list.append(text)

    def parse_image(self, **kwargs):
        """
        arguments:
            selector: scrapy.Selector.
            src: image src css path.
            desc: description css path; if it isn't given,
                the description is an empty string.
            content: article content list.
            resource: article resource list.
            count: image counter.
        """
        kwargs['type'] = 'image'
        return self.parse_resource(**kwargs)

    def parse_resource(self, **kwargs):
        placeholder = PLACEHOLDER[kwargs['type']]
        count = kwargs['count']
        content_list = kwargs['content']
        resource_list = kwargs['resource']
        sel = kwargs['selector']
        src = '@src'
        if 'src_attr' in kwargs:
            src = kwargs['src_attr']
        src = sel.css(kwargs['src']).xpath(src).extract()
        desc = ''
        if 'desc' in kwargs:
            desc = ''.join(sel.css(kwargs['desc']).extract()).strip()
        if len(src) > 1:
            for s in src:
                ref = placeholder.format(count)
                res = {'src': s, 'desc': desc, 'ref': ref}
                resource_list.append(res)
                content_list.append(ref)
                count += 1
            return count
        if not src:
            # The selector matched nothing; keep the counter unchanged.
            return count
        src = src[0]
        if not src:
            return count
        ref = placeholder.format(count)
        count += 1
        res = {'ref': ref, 'src': src, 'desc': desc}
        resource_list.append(res)
        content_list.append(res['ref'])
        return count
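A minimal, hypothetical driver for BaseExtractor, not part of this commit. setting.py is not shown in this diff, so the shape of PLACEHOLDER used below, a dict mapping a resource type to a format string with one slot for the counter, is an assumption; the example registers a stub only if no real setting.py is importable, and it must be run inside extrator/ so base.py can be imported.

# Hypothetical BaseExtractor driver (run inside extrator/).
import sys
import types

import scrapy

try:
    import setting  # use the project's real setting.py if it exists
except ImportError:
    stub = types.ModuleType('setting')
    stub.PLACEHOLDER = {'image': '[image-{}]'}  # assumed shape: type -> format string
    sys.modules['setting'] = stub

from base import BaseExtractor

extractor = BaseExtractor()
contents, images, count = [], [], 1

# A plain paragraph node becomes a text entry.
extractor.parse_paragraph(
    scrapy.Selector(text='<p>Some body text</p>', type='html'), contents)

# An image node becomes a placeholder in contents plus a resource record.
count = extractor.parse_image(
    selector=scrapy.Selector(
        text='<p><img class="pic" data-src="https://example.com/a.jpg"></p>',
        type='html'),
    src='img', src_attr='@data-src',
    content=contents, resource=images, count=count)

print(contents)  # paragraph text followed by the image placeholder
print(images)    # [{'ref': ..., 'src': 'https://example.com/a.jpg', 'desc': ''}]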
105  gooood/extrator/clean_detail.py  Normal file
@@ -0,0 +1,105 @@
from base import BaseExtractor
import json
import re
import scrapy
from dao.file_dao import File
from lxml import etree


class CleanDetail(BaseExtractor):

    def __init__(self):
        self.file = File()

    def clean(self):
        # Parse every raw detail page saved by spider/gooood_detail.py.
        df = self.file.read('raw_gd_detail')
        list_res = []
        for response_text in df["response_text"].values:
            selector = etree.HTML(response_text)
            # Basic fields come from the server-rendered markup.
            title = selector.xpath('//h1/text()')[0]
            subtitle = selector.xpath('//div[contains(@class,"single-content")]/h2/text()')[0]
            url = selector.xpath('//meta[@property="og:url"]/@content')[0]

            # Project metadata sits next to labelled <span> headings in the sidebar.
            design_company_str = selector.xpath(
                '//span[contains(text(), "设计公司")]/../following-sibling::div[1]/a/text()')
            if design_company_str:
                design_company = design_company_str[0]
            else:
                design_company = ''

            location_str = selector.xpath('//span[contains(text(), "位置")]/../following-sibling::div[1]/a/text()')
            if location_str:
                location = location_str[0]
            else:
                location = ''

            type_c_str = selector.xpath('//span[contains(text(), "类型")]/../following-sibling::div[1]/a/text()')
            if type_c_str:
                type_c = type_c_str[0]
            else:
                type_c = ''

            material_str = selector.xpath('//span[contains(text(), "材料")]/../following-sibling::div[1]/a/text()')
            if material_str:
                material = material_str[0]
            else:
                material = ''

            label_str = selector.xpath(
                '//span[contains(text(), "标签")]/../following-sibling::div[1]/a/text()')
            if label_str:
                label = label_str
            else:
                label = []

            classification_str = selector.xpath(
                '//span[contains(text(), "分类")]/../following-sibling::div[1]/a/text()')
            if classification_str:
                classification = classification_str[0]
            else:
                classification = ''

            # The article body itself lives in the window.__INITIAL_STATE__ JSON blob.
            item = re.findall(r'window.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
            item_dict = json.loads(item)
            content_html = item_dict.get('post').get('content')
            selector = scrapy.Selector(text=content_html, type='html')
            nodes = []
            nodes.extend(selector.xpath('//p').extract())
            images, icount = [], 1
            contents = []
            # Walk the <p> nodes: images become placeholders, text becomes paragraphs.
            for node in nodes:
                sel = scrapy.Selector(text=node, type='html')
                if sel.css('img ::attr(class)'):
                    icount = self.parse_image(
                        selector=sel, src='img', resource=images,
                        count=icount, content=contents, src_attr='@data-src')
                elif sel.css('p') and '{margin:' not in node:
                    self.parse_paragraph(sel, contents)
                elif sel.css('center ::text'):
                    self.parse_paragraph(sel, contents, tag='center')

            item = {
                "title": title,
                "subtitle": subtitle,
                "url": url,
                "project_1": {
                    'design_company': design_company,
                    'location': location,
                    'type_c': type_c,
                    'material': material,
                    'label': label,
                    'classification': classification
                },
                "contents": contents,
                "images": images
            }
            list_res.append(item)
        self.file.write_json('结果表', list_res)

    def run(self):
        self.clean()


if __name__ == '__main__':
    clean_detail = CleanDetail()
    clean_detail.run()
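The key step in clean() is that the article body is not taken from the static markup but from the window.__INITIAL_STATE__ JSON blob embedded in a script tag. A self-contained illustration of just that step, using a made-up fragment rather than real gooood.cn markup:

# Standalone illustration of the __INITIAL_STATE__ extraction (made-up HTML).
import json
import re

response_text = (
    '<script>window.__INITIAL_STATE__='
    '{"post": {"content": "<p>hello</p><p>world</p>"}}'
    '</script>'
)

blob = re.findall(r'window.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
content_html = json.loads(blob).get('post').get('content')
print(content_html)  # the HTML that clean() then splits into <p> nodes and parses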
0  gooood/main.py  Normal file
0  gooood/spider/__init__.py  Normal file
31  gooood/spider/gooood_detail.py  Normal file
@@ -0,0 +1,31 @@
from dao.file_dao import File
import requests


file = File()


def gooood_detail(address):
    url = f"https://www.gooood.cn/{address}.htm"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.text


def run():
    df = file.read('raw_gd_list')
    response_text_list = []

    for slug in df["slug"].values:
        print(slug)
        response_text = gooood_detail(slug)
        response_text_list.append([response_text])  # one row per detail page

    status = file.write('raw_gd_detail', response_text_list, ['response_text'])
    if status:
        print('详情页面抓取完成')  # "detail pages fetched"


if __name__ == '__main__':
    run()
30  gooood/spider/gooood_list.py  Normal file
@@ -0,0 +1,30 @@
from dao.file_dao import File
import requests


file = File()


def gooood_list(page):
    url = f"https://dashboard.gooood.cn/api/wp/v2/posts?page={page}&per_page=20"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.json()


def run():
    slug_list = []
    for page in range(1, 4):  # first 3 pages, 20 posts per page
        res_list = gooood_list(page)
        for res in res_list:
            slug = res.get('slug')
            slug_list.append([slug])

    status = file.write('raw_gd_list', slug_list, ['slug'])
    if status:
        print('列表页面抓取完成')  # "list pages fetched"


if __name__ == '__main__':
    run()
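After both spiders and the cleaner have run, docs/结果表.json holds one JSON object per line (see File.write_json). A small, hypothetical sanity check of that output, not part of this commit; it assumes it is run from spider/ or extrator/ so ../docs resolves:

# Hypothetical check of the final output (run from spider/ or extrator/).
import json

with open('../docs/结果表.json') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(len(records))
print(records[0]['title'], records[0]['url'])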