luzhisheng 2023-05-11 10:51:13 +08:00
parent e9c65585cb
commit bde9b2fe55
11 changed files with 337 additions and 0 deletions

13
gooood/README.md Normal file

@@ -0,0 +1,13 @@
# Example

The spider side of this site is simple, but the cleaning logic is somewhat tedious; the article body is rendered to HTML by the backend.

**Project structure**

1. docs stores the output files
2. spider holds the spider scripts
3. extrator holds the parsing scripts

**Known issues**

1. This is only a simple example. The project has no test cases, so long-term correct operation of the code cannot be guaranteed
2. The logic for the second block of project info is rather fragmented; time is limited, so it is left unhandled for now

0
gooood/dao/__init__.py Normal file

34
gooood/dao/file_dao.py Normal file

@@ -0,0 +1,34 @@
import json

import pandas as pd


class File(object):

    def write_json(self, filename, data_list):
        """Write each item of data_list as one JSON line under ../docs/."""
        with open(f'../docs/{filename}.json', 'w', encoding='utf-8') as f:
            for item in data_list:
                json.dump(item, f, ensure_ascii=False)
                f.write("\n")

    def write(self, filename, data_list, columns_list):
        """Write data_list to ../docs/<filename>.csv with the given columns."""
        frame = pd.DataFrame(data_list, columns=columns_list)
        frame.to_csv(f'../docs/{filename}.csv')
        return True

    def read(self, filename):
        """Read ../docs/<filename>.csv back into a DataFrame."""
        return pd.read_csv(f'../docs/{filename}.csv')
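
A quick round trip is enough to sanity-check File; this is a sketch, assuming a docs/ directory exists one level above the working directory (the 'demo' filename is illustrative):

from dao.file_dao import File

# assumes ../docs/ exists; 'demo' is an illustrative filename
file = File()
file.write('demo', [['hello'], ['world']], ['word'])
frame = file.read('demo')
print(frame['word'].tolist())  # ['hello', 'world']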

File diff suppressed because one or more lines are too long


64
gooood/extrator/base.py Normal file

@@ -0,0 +1,64 @@
from setting import PLACEHOLDER
from w3lib.html import remove_tags


class BaseExtractor(object):

    def parse_paragraph(self, selector, content_list, tag='p'):
        """
        arguments:
            selector: scrapy.Selector.
            content_list: article content list.
            tag: paragraph tag.
        """
        text = ''.join(selector.css(tag).xpath('node()').extract()).strip()
        if text:
            text = remove_tags(text, which_ones=('a', 'meta', 'script', 'span'))
            content_list.append(text)

    def parse_image(self, **kwargs):
        """
        arguments:
            selector: scrapy.Selector.
            src: image src css path.
            desc: description css path; if absent,
                the description is an empty string.
            content: article content list.
            resource: article resource list.
            count: image counter.
        """
        kwargs['type'] = 'image'
        return self.parse_resource(**kwargs)

    def parse_resource(self, **kwargs):
        placeholder = PLACEHOLDER[kwargs['type']]
        count = kwargs['count']
        content_list = kwargs['content']
        resource_list = kwargs['resource']
        sel = kwargs['selector']
        src_attr = kwargs.get('src_attr', '@src')
        src = sel.css(kwargs['src']).xpath(src_attr).extract()
        desc = ''
        if 'desc' in kwargs:
            desc = ''.join(sel.css(kwargs['desc']).extract()).strip()
        if not src:
            # nothing matched at all; leave the counter untouched
            return count
        if len(src) > 1:
            # several resources under one node: register each with its own ref
            for s in src:
                ref = placeholder.format(count)
                resource_list.append({'src': s, 'desc': desc, 'ref': ref})
                content_list.append(ref)
                count += 1
            return count
        src = src[0]
        if not src:
            return count
        ref = placeholder.format(count)
        count += 1
        resource_list.append({'ref': ref, 'src': src, 'desc': desc})
        content_list.append(ref)
        return count
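
setting.py is not part of this diff, so PLACEHOLDER's exact value is unknown; assuming it maps each resource type to a format string (the value in the comment below is illustrative), parse_image can be exercised in isolation:

import scrapy

# assumes setting.py defines something like PLACEHOLDER = {'image': '<<image_{}>>'}
from base import BaseExtractor

html = '<p><img class="lazy" data-src="https://example.com/a.jpg"></p>'
sel = scrapy.Selector(text=html, type='html')
contents, images = [], []
count = BaseExtractor().parse_image(
    selector=sel, src='img', src_attr='@data-src',
    content=contents, resource=images, count=1)
print(contents)  # ['<<image_1>>'] under the assumed placeholder
print(images)    # [{'ref': '<<image_1>>', 'src': 'https://example.com/a.jpg', 'desc': ''}]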


@@ -0,0 +1,105 @@
import json
import re

import scrapy
from lxml import etree

from base import BaseExtractor
from dao.file_dao import File


class CleanDetail(BaseExtractor):

    def __init__(self):
        self.file = File()

    @staticmethod
    def first_or_default(values, default=''):
        """Return the first xpath match, or a default when nothing matched."""
        return values[0] if values else default

    def clean(self):
        df = self.file.read('raw_gd_detail')
        list_res = []
        for response_text in df["response_text"].values:
            selector = etree.HTML(response_text)
            title = selector.xpath('//h1/text()')[0]
            subtitle = selector.xpath(
                '//div[contains(@class,"single-content")]/h2/text()')[0]
            url = selector.xpath('//meta[@property="og:url"]/@content')[0]

            # Each metadata field pairs a <span> label with a sibling <div> of links.
            def meta_field(label):
                return selector.xpath(
                    f'//span[contains(text(), "{label}")]'
                    '/../following-sibling::div[1]/a/text()')

            design_company = self.first_or_default(meta_field('设计公司'))
            location = self.first_or_default(meta_field('位置'))
            type_c = self.first_or_default(meta_field('类型'))
            material = self.first_or_default(meta_field('材料'))
            label = meta_field('标签')  # keep every tag, not just the first
            classification = self.first_or_default(meta_field('分类'))

            # The article body is embedded as JSON in window.__INITIAL_STATE__.
            state = re.findall(
                r'window\.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
            content_html = json.loads(state).get('post').get('content')

            sel_content = scrapy.Selector(text=content_html, type='html')
            nodes = sel_content.xpath('//p').extract()
            images, icount = [], 1
            contents = []
            for node in nodes:
                sel = scrapy.Selector(text=node, type='html')
                if sel.css('img ::attr(class)'):
                    icount = self.parse_image(
                        selector=sel, src='img', resource=images,
                        count=icount, content=contents, src_attr='@data-src')
                elif sel.css('p') and '{margin:' not in node:
                    self.parse_paragraph(sel, contents)
                elif sel.css('center ::text'):
                    self.parse_paragraph(sel, contents, tag='center')

            list_res.append({
                "title": title,
                "subtitle": subtitle,
                "url": url,
                "project_1": {
                    'design_company': design_company,
                    'location': location,
                    'type_c': type_c,
                    'material': material,
                    'label': label,
                    'classification': classification
                },
                "contents": contents,
                "images": images
            })
        self.file.write_json('结果表', list_res)

    def run(self):
        self.clean()


if __name__ == '__main__':
    clean_detail = CleanDetail()
    clean_detail.run()
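
The least obvious step above is pulling the embedded state JSON out of the raw page; here is a minimal, self-contained sketch of that pattern on a synthetic fragment:

import json
import re

# synthetic fragment; the real pages embed the post JSON the same way
html = '<script>window.__INITIAL_STATE__={"post":{"content":"<p>hi</p>"}}</script>'
state = json.loads(re.findall(r'window\.__INITIAL_STATE__=(.*?)</script>', html)[0])
print(state['post']['content'])  # -> <p>hi</p>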

0
gooood/main.py Normal file
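
main.py is added empty in this commit. A hypothetical orchestrator could chain the three steps; the spider module names below are assumptions (their file names are not visible in this diff), so treat this as a sketch of the wiring rather than the project's actual entry point:

# hypothetical wiring; the module names are assumptions, not part of this commit
from spider import gd_list, gd_detail
from extrator import clean_detail


def main():
    gd_list.run()                     # 1. slugs -> docs/raw_gd_list.csv
    gd_detail.run()                   # 2. pages -> docs/raw_gd_detail.csv
    clean_detail.CleanDetail().run()  # 3. parse -> docs/结果表.json


if __name__ == '__main__':
    main()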


@@ -0,0 +1,31 @@
import requests

from dao.file_dao import File

file = File()


def gooood_detail(address):
    """Fetch one detail page by its slug."""
    url = f"https://www.gooood.cn/{address}.htm"
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.get(url, headers=headers)
    return response.text


def run():
    df = file.read('raw_gd_list')
    response_text_list = []
    for slug in df["slug"].values:
        print(slug)
        response_text_list.append([gooood_detail(slug)])
    status = file.write('raw_gd_detail', response_text_list, ['response_text'])
    if status:
        print('Detail page crawl finished')


if __name__ == '__main__':
    run()
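
The loop above fires requests back-to-back and saves whatever comes back, error pages included; a hedged sketch of a slightly more defensive fetch (timeout, status check, a short delay between requests):

import time

import requests


def gooood_detail_safe(address, delay=1.0):
    """Variant of gooood_detail that fails loudly and paces requests."""
    url = f"https://www.gooood.cn/{address}.htm"
    response = requests.get(url, headers={'cookie': 'language=zh_CN;'}, timeout=10)
    response.raise_for_status()  # surface 4xx/5xx instead of storing an error page
    time.sleep(delay)            # be polite between consecutive fetches
    return response.text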


@@ -0,0 +1,30 @@
import requests

from dao.file_dao import File

file = File()


def gooood_list(page):
    """Fetch one page of the posts API (20 posts per page)."""
    url = f"https://dashboard.gooood.cn/api/wp/v2/posts?page={page}&per_page=20"
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.get(url, headers=headers)
    return response.json()


def run():
    slug_list = []
    for page in range(1, 4):
        for res in gooood_list(page):
            slug_list.append([res.get('slug')])
    status = file.write('raw_gd_list', slug_list, ['slug'])
    if status:
        print('List page crawl finished')


if __name__ == '__main__':
    run()
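
range(1, 4) hard-codes three pages. The endpoint looks like a standard WordPress REST API, which reports the total page count in the X-WP-TotalPages response header; if that holds for this site (unverified), the loop bound can be read from the header instead:

import requests


def total_pages():
    """Read the page count from X-WP-TotalPages, defaulting to 1 if absent."""
    url = "https://dashboard.gooood.cn/api/wp/v2/posts?page=1&per_page=20"
    response = requests.get(url, headers={'cookie': 'language=zh_CN;'}, timeout=10)
    return int(response.headers.get('X-WP-TotalPages', 1))

# then: for page in range(1, total_pages() + 1): ...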