2020/2/10 更新,见 Issue:#9
python3webspider / weibolist Goto Github PK
View Code? Open in Web Editor NEW WeiboList of MaYun
WeiboList of MaYun
2020/2/10 更新,见 Issue:#9
2021年12月9日20:40:04
json.get('data').get('cardlistInfo').get('cardlistInfo')会返回None.
可用代码:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
# Endpoint of the m.weibo.cn container API; the urlencoded query is appended.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Browser-like headers sent with every request made by get_one_page.
headers = {
    # Host must be the bare host name; a trailing slash is not a valid value.
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


def get_one_page(page):
    """Fetch one page of the user's timeline and return the decoded JSON.

    Args:
        page: Page number. ``0`` leaves the ``page`` parameter out of the
            query string.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (request failure, non-200 status, or a body that
        is not valid JSON).
    """
    params = {
        'type': 'uid',
        # These two numbers can be found in the profile URL
        # https://m.weibo.cn/u/7198559139?uid=7198559139&luicode=10000011&lfid=100103type%3D1%26q%3D%E4%B8%83%E6%B5%B7
        # containerid is in the URL of the largest XHR request shown in the
        # browser's Network tab.
        'value': '7198559139',
        'containerid': '1076037198559139',
    }
    if page != 0:
        params['page'] = page
    url = base_url + urlencode(params)
    print(url)
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error or login page).
        print(e.args)
        return None
def parse_page(json):
    """Yield one summary dict per post contained in a timeline response.

    Args:
        json: Decoded response as returned by ``get_one_page``; may be
            ``None`` or lack ``data``/``cards``, in which case nothing is
            yielded.

    Yields:
        dict with the keys ``id``, ``text`` (the post text run through
        pyquery's ``.text()``), ``attitudes``, ``comments`` and ``reposts``.
    """
    # Progress trace: the driver loop stores a value in the module-level
    # ``since_id`` before calling. Look it up defensively so calling this
    # function on its own does not raise NameError.
    print(globals().get('since_id'))
    if not json:
        return
    # ``data`` or ``cards`` can be absent; treat that as "no posts".
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Some cards carry no ``mblog`` entry; skip them instead of
            # failing with AttributeError on ``None``.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
# The full cardlistInfo is not needed here; only its ``page`` value is read.
with open('nanami.txt', 'w', encoding='utf-8') as f:
    for page in range(1, 100):
        json = get_one_page(page)
        data = json.get('data') if json else None
        if not data:
            # The fetch failed or the response carries no ``data``; stop
            # instead of dereferencing None.
            break
        # parse_page prints this module-level value as a progress trace.
        since_id = (data.get('cardlistInfo') or {}).get('page')
        for result in parse_page(json):
            f.write(str(result))
            f.write('\n')
if page == 1 and index == 1:
continue
在方法parse_page作何用处,是否可以删除
DeprecationWarning: insert is deprecated. Use insert_one or insert_many instead.
if collection.insert(result):
Traceback (most recent call last):
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 59, in
for result in results:
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 42, in parse_page
weibo['id'] = item.get('id')
AttributeError: 'NoneType' object has no attribute 'get'
运行的时候出错,大佬什么原因啊,小白,讲解的详细点,谢谢
Traceback (most recent call last):
File "D:/test-ajax/mayunweibo.py", line 53, in
for result in results:
File "D:/test-ajax/mayunweibo.py", line 34, in parse_page
for item in items:
TypeError: 'NoneType' object is not iterable
找了半天,也不知道如何解决
ajax请求命令最后的page=改成sinceid=了,还有办法抓么?
import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool
# Endpoint of the m.weibo.cn container API; the urlencoded query is appended.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Browser-like headers passed to requests.get by get_page.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    # The value must end at "Safari/537.36"; a trailing comma is not part of
    # a User-Agent string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page):
    """Fetch one page of the user's timeline and return the decoded JSON.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (request failure, non-200 status, or a body that
        is not valid JSON).
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('error', e.args)
        return None
    if resp.status_code != 200:
        return None
    try:
        return resp.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print('error', e.args)
        return None
from pyquery import PyQuery as pq
def parse_page(json):
    """Yield one summary dict per post contained in a timeline response.

    Args:
        json: Decoded response as returned by ``get_page``; may be ``None``
            or lack ``data``/``cards``, in which case nothing is yielded.

    Yields:
        dict with the keys ``id``, ``text`` (the post text run through
        pyquery's ``.text()``), ``attitudes``, ``comments`` and ``reposts``.
    """
    if not json:
        return
    # ``data`` or ``cards`` can be absent; treat that as "no posts".
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # This check is the key point: some of the returned cards do not
            # contain the fields read below.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            # The field is spelled "reposts_count"; "repost_count" always
            # produced None.
            'reposts': mblog.get('reposts_count'),
        }
# if json:
# items = json.get('data').get('cards')
# for item in items:
# print(type(item))
# item = item.get('mblog')
# weibo = {}
# print(type(item))
# weibo['id'] = item.get('id')
# weibo['text'] = pq(item.get('text')).text()
# weibo['attitudes'] = item.get('attitudes_count')
# weibo['comments'] = item.get('comments_count')
# weibo['reposts'] = item.get('reposts_count')
# yield weibo
# Script entry point. The dunder names are required: a bare ``name`` is
# undefined and would raise NameError.
if __name__ == '__main__':
    # Crawl pages 1 through 10 and print every parsed post.
    for page in range(1, 11):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
如题,page参数已改成since_id,已经发现下一个ajax请求的since_id在上一个since_id中。故解析json的时候,不仅要返回列表,还要返回since_id。又由于本人对yield函数不太理解,故直接再定义了一个新函数来专门负责返回since_id。(代码附后)
这看起来应当是可行的,看网页返回的ajax文件就能发现since_id = json.get('data').get('cardlistInfo').get('since_id')。
但是,当我进行调试时,却发现python请求得到的json中,恰恰缺少了since_id一项,这是怎么回事呢?
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
# Endpoint of the m.weibo.cn container API; get_url appends the urlencoded
# query string to it.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Request headers passed to requests.get by get_page.
headers = {
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2492465520',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
def get_url(since_id):
    """Build the timeline API URL for the given ``since_id`` cursor.

    Args:
        since_id: Cursor value sent as the ``since_id`` query parameter.
            When it is ``None`` the parameter is left out, because urlencode
            would otherwise send the literal text ``since_id=None``.

    Returns:
        The complete request URL as a string.
    """
    params = {
        'type': 'uid',
        'value': '2492465520',
        'containerid': '1076032492465520',
    }
    if since_id is not None:
        params['since_id'] = since_id
    return base_url + urlencode(params)
def get_page(url):
    """Download *url* and return the decoded JSON body.

    Args:
        url: Fully built request URL (see ``get_url``).

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (request failure, non-200 status, or a body that
        is not valid JSON).
    """
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not JSON (e.g. an HTML error page).
        print('Error', e.args)
        return None
def parse_page(json):
    """Yield the id and plain text of every post in a timeline response.

    Args:
        json: Decoded response as returned by ``get_page``; may be ``None``
            or lack ``data``/``cards``, in which case nothing is yielded.

    Yields:
        dict with the keys ``id`` and ``text`` (the post text run through
        pyquery's ``.text()``).
    """
    if not json:
        return
    # ``data`` or ``cards`` can be absent; treat that as "no posts".
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an ``mblog`` entry would otherwise fail with
            # AttributeError on ``None``.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
        }
def parse_engine(json, i):
    """Follow the ``since_id`` cursor through at most *i* further pages.

    Each round reads ``data.cardlistInfo.since_id`` from the current
    response, prints it, and fetches the page that cursor points to.

    Args:
        json: Decoded response of the current page, or ``None`` when the
            fetch failed.
        i: Number of follow-up requests that may still be made.

    Returns:
        Always ``None``; the cursor values are reported via ``print``.
    """
    while i > 0:
        if not json:
            # The previous fetch failed, so there is nothing to read from.
            return None
        card_info = (json.get('data') or {}).get('cardlistInfo') or {}
        since_id = card_info.get('since_id')
        print(since_id)
        if since_id is None:
            # Without a cursor the next request cannot be built; stop rather
            # than requesting "since_id=None" over and over.
            return None
        json = get_page(get_url(since_id))
        i -= 1
    return None
if __name__ == '__main__':
    # Seed the crawl with a fixed since_id cursor, then let parse_engine
    # follow the cursor for ten rounds.
    first_page = get_page(get_url('4293511823942483'))
    parse_engine(first_page, 10)
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Something interesting about the web. A new door to the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Something interesting about visualization and data art.
Something interesting about games, to make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.