Comments (5)
上一个简单的代码看看
from feapder.
上一个简单的代码看看
这里我把代理 API 改掉了,其他的没有改动,原封不动。
class S1688(feapder.AirSpider):
    """AirSpider that searches 1688.com for herbal-medicine offers.

    For every entry returned by ``load_dire_list()`` a search request is
    issued; matching offers are then followed to their detail page, and one
    ``Item`` (table ``s1688``) is emitted per offer.

    Each ``dire`` entry is read as a mapping with at least the keys
    ``dire_name``, ``dire_location``, ``dire_spec`` and ``ID``.
    """

    __custom_setting__ = {
        # "USE_SESSION": True,
        "SPIDER_THREAD_COUNT": 12,
        "PROXY_ENABLE": True,
        # "SPIDER_SLEEP_TIME": [2, 5],
        # "LOG_LEVEL": "INFO",
        "PROXY_EXTRACT_API": "http://v2.api.juliangip.com/dynamic/getips",
    }

    def download_midware(self, request: Request):
        """Attach a randomised ``cna`` cookie and a fixed user-agent.

        The headers of *request* are replaced wholesale, then the same
        request object is returned.
        """
        # 24 characters drawn (with replacement) from a fixed alphabet.
        cna = ''.join(random.choices('DwftHIHbiXICAQHA8429Gdvc', k=24))
        request.headers = {
            "Cookie": f"cna={cna};",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42"
        }
        return request

    def exception_request(self, request: Request, response, e):
        """Tag the proxy used by a failed request with ``-1``."""
        # NOTE(review): assumes request.proxies_pool is already initialised
        # at this point -- confirm against the installed feapder version.
        request.proxies_pool.tag_proxy(request.requests_kwargs.get("proxies"), -1)

    def start_requests(self):
        """Yield one search request per entry of ``load_dire_list()``."""
        for dire in load_dire_list():
            # The keyword is percent-encoded as GBK before being put in the URL.
            keywords = parse.quote(
                '%s %s 中药材' % (dire['dire_name'], dire['dire_location']),
                encoding='gbk',
            )
            url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={keywords}&spm="
            yield feapder.Request(url=url, dire=dire, verify=False)

    def parse(self, request: Request, response: Response):
        """Extract offers from a search page and request their detail pages.

        Offers whose subject does not contain ``dire['dire_name']`` are
        skipped. If the embedded data marker is absent the page is silently
        ignored.

        Raises:
            Exception: any error while reading the page is treated as a
                risk-control hit; the current proxy is tagged ``-1`` and the
                error is re-raised (chained to the original cause).
        """
        dire = request.dire
        try:
            # Raw string: "\(" in a normal literal is an invalid escape sequence.
            match = re.search(
                r"window.data.offerresultData = successDataCheck\((.*)\)",
                response.text,
            )
            if not match:
                return

            data = json.loads(match.group(1))['data']
            for drug in data.get("offerList", []):
                drug_id = drug['id']
                title = drug['information']['subject']
                if dire['dire_name'] not in title:
                    continue

                # Merge the search entry's fields into the offer record.
                drug.update(dire)
                yield feapder.Request(
                    url=f"https://detail.1688.com/offer/{drug_id}.html",
                    callback=self.parse_detail,
                    dire=dire,
                    dire_item=drug,
                )
        except Exception as e:
            request.proxies_pool.tag_proxy(request.requests_kwargs.get("proxies"), -1)
            raise Exception(f"链接:{request.url}被触发风控,无法正常获取数据,尝试重试!") from e

    def parse_detail(self, request: Request, response: Response):
        """Build and yield one ``s1688`` item from an offer detail page.

        Nothing is yielded when the ``window.__INIT_DATA`` marker is missing
        or when the page has no attribute module.
        """
        dire = request.dire
        offer = request.dire_item

        match = re.search(r"window.__INIT_DATA=(.*)", response.text)
        if not match:
            return
        data = json.loads(match.group(1))
        global_data = data['globalData']

        item = Item()
        item.item_name = "s1688"
        item.table_name = item.item_name
        item.update({"tempModel": global_data['tempModel']})
        item.update({"skuInfoMap": global_data['skuModel']['skuInfoMap']})
        item.update({"skuModel": global_data['skuModel']})
        item.update({"orderParam": global_data['orderParamModel']['orderParam']})

        # Find the attribute module nested under the detail-tab container.
        container = next(
            (m for m in data['modules'] if m['name'] == '@ali/tdmod-od-pc-layout-detail-tab-container'),
            None,
        )
        children = container.get('children', []) if container is not None else []
        attr_modules = [c for c in children if c['name'] == '@ali/tdmod-od-pc-attribute-new']
        if not attr_modules:
            return

        # The attribute payload is stored under the module's uuid.
        attr_module = attr_modules[0]
        item.update({"attrList": data['data'][attr_module['uuid']]['data']})

        item['search_key'] = dire['dire_name']
        item['dire_spec'] = dire['dire_spec']
        item['ID'] = dire['ID']
        # Applied last, so the offer's fields override any keys set above.
        item.update(offer)

        log.info(f"s1688 {item['ID']}-{item['tempModel']['offerTitle']}-{item['search_key']}-{item['dire_spec']}")
        yield item
from feapder.
能留个QQ?或者其他联系方式?
from feapder.
能留个QQ?或者其他联系方式?
1577134779
from feapder.
代理模块是打算废掉重写的,现在用起来比较麻烦。你可以先自己写个代理池,等我这边封装好了你再用我这个。
from feapder.
Related Issues (20)
- 使用selenium或者PlayWright 都指定了thread_count 但是无法打开多个浏览器
- 在使用playwright的时候总是报错,sync_playwright().start()这里会报错 HOT 2
- UpdateItem 批量更新数据问题 HOT 2
- 解析不了web,python3.11、feapder1.8.5 HOT 1
- 单机多进程模式下,MySQL 连接报错:通常每个套接字地址(协议/网络地址/端口)只允许使用一次 HOT 1
- 想要在start()启动爬虫的时候可以携带可变的初始url作为参数
- render=True报错 HOT 3
- 如何在自定义下载器中启用setting中配置的代理? HOT 3
- feapder v1.8.8 使用代理IP报错? HOT 2
- mysql 查询 有bug HOT 2
- 指定parser_name和callback不生效
- 關於BatchSpider
- PLAYWRIGHT 默认开启无痕 HOT 2
- feapder requests能支持curl_cffi类似绕过指纹吗 HOT 2
- 批次爬虫获取redis中的任务时一直阻塞,任务存在就是不去执行,重新执行采集程序又正常了 HOT 3
- response.re_first()报错 HOT 1
- 浏览器渲染功能是否可以添加支持drissionpage库? HOT 4
- mongo使用url连接副本集,不能使用yeild item来进行存储 HOT 1
- 如何让AirSpider在调用时每次的浏览器实例都为最新 HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games: make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from feapder.