
weibolist's Issues

I found that json.get('data').get('cardlistInfo') contains no since_id, only page

2021-12-09 20:40:04
json.get('data').get('cardlistInfo').get('since_id') returns None.
Working code:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_one_page(page):
    params = {
        'type': 'uid',
        # These two numbers can be found at
        # https://m.weibo.cn/u/7198559139?uid=7198559139&luicode=10000011&lfid=100103type%3D1%26q%3D%E4%B8%83%E6%B5%B7
        # containerid is in the URL of the largest XHR request on the Network tab
        'value': '7198559139',
        'containerid': '1076037198559139'
    }
    if page != 0:
        params['page'] = page
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print(e.args)

def parse_page(json):
    global since_id
    print(since_id)
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo

# cardlistInfo is not needed here, only page
with open('nanami.txt', 'w', encoding='utf-8') as f:
    for page in range(1, 100):
        json = get_one_page(page)
        since_id = json.get('data').get('cardlistInfo').get('page')
        results = parse_page(json)
        for result in results:
            f.write(str(result))
            f.write('\n')
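
A side note (not from the original post): since the whole point of this issue is that keys such as cardlistInfo or since_id may be missing, the pagination value can also be read defensively, so that an absent key produces None instead of an AttributeError on NoneType:

```python
# Sketch only, assuming `json` is the dict returned by get_one_page():
# chained .get() calls with empty-dict fallbacks tolerate missing keys.
info = (json.get('data') or {}).get('cardlistInfo') or {}
since_id = info.get('page')  # None if 'page' is absent as well
```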

Error when running

DeprecationWarning: insert is deprecated. Use insert_one or insert_many instead.
if collection.insert(result):
Traceback (most recent call last):
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 59, in
for result in results:
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 42, in parse_page
weibo['id'] = item.get('id')
AttributeError: 'NoneType' object has no attribute 'get'
I get this error when running. What is the cause? I'm a beginner, so please explain in a bit of detail. Thanks.
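
A likely explanation (the fix in the next issue points the same way): not every entry in data.cards contains an mblog field, so item.get('mblog') returns None for those cards and the following item.get('id') then fails with the AttributeError above. A minimal guard inside the loop of parse_page, sketched against the code shown in this issue:

```python
# Sketch: skip cards that carry no 'mblog' before reading its fields.
for item in items:
    item = item.get('mblog')
    if not item:  # some cards contain no post body, so 'mblog' is missing
        continue
    weibo = {}
    weibo['id'] = item.get('id')
    # ... remaining fields as before ...
```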

Beginner here: after a whole evening of tweaking, it finally runs correctly

```python
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
# The following imports are unused here (left over from another example)
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(page):
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        resp = requests.get(url, headers=headers)
        return resp.json()
    except requests.ConnectionError as e:
        print('error', e.args)

def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            if not item:  # This line is the key: some of the returned entries do not contain the fields below
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo

        # if json:
        #     items = json.get('data').get('cards')
        #     for item in items:
        #         print(type(item))
        #         item = item.get('mblog')
        #         weibo = {}
        #         print(type(item))
        #         weibo['id'] = item.get('id')
        #         weibo['text'] = pq(item.get('text')).text()
        #         weibo['attitudes'] = item.get('attitudes_count')
        #         weibo['comments'] = item.get('comments_count')
        #         weibo['reposts'] = item.get('reposts_count')
        #         yield weibo

if __name__ == '__main__':
    for page in range(1, 11):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
```
The code is above. The cause was that some entries in the returned data contain no data.

The page parameter has been changed to since_id; a few technical questions about the Ajax requests

As in the title: the page parameter has been changed to since_id, and I've found that the since_id for the next Ajax request is contained in the previous response. So when parsing the JSON, we need to return not only the list of posts but also the since_id. Since I don't really understand yield, I simply defined a separate function dedicated to returning the since_id (code attached below).

In principle this should work: looking at the Ajax responses returned in the browser, you can see that since_id = json.get('data').get('cardlistInfo').get('since_id'). But when I debug, the JSON obtained by the Python request is missing exactly the since_id field. Why is that?

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2492465520',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_url(since_id):
    params = {
        'type': 'uid',
        'value': '2492465520',
        'containerid': '1076032492465520',
        'since_id': since_id
    }
    url = base_url + urlencode(params)
    return url

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for index, item in enumerate(items):
            # if page == 1 and index == 1:
                # continue
            # else:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            # weibo['attitudes'] = item.get('attitudes_count')
            # weibo['comments'] = item.get('comments_count')
            # weibo['reposts'] = item.get('reposts_count')
            yield weibo

def parse_engine(json, i):
    if i > 0:
        # results = parse_page(json)
        # for result in results:
        #     print(result)
        since_id = json.get('data').get('cardlistInfo').get('since_id')
        print(since_id)
        next_url = get_url(since_id)
        next_json = get_page(next_url)
        return parse_engine(next_json, i - 1)

if __name__ == '__main__':

    url = get_url('4293511823942483')
    i = 10
    json = get_page(url)
    parse_engine(json, i)

        

Error when running

Traceback (most recent call last):
File "D:/test-ajax/mayunweibo.py", line 53, in
for result in results:
File "D:/test-ajax/mayunweibo.py", line 34, in parse_page
for item in items:
TypeError: 'NoneType' object is not iterable

I've searched for a long time and still don't know how to fix it.
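
The TypeError says that json.get('data').get('cards') returned None for that request, so the for loop has nothing to iterate over. One way to tolerate such responses is to fall back to empty containers; this is only a sketch (it assumes the same imports as the code above and that an empty page should simply yield nothing):

```python
def parse_page(json):
    # Missing 'data' or 'cards' keys yield no items instead of raising
    # TypeError/AttributeError on NoneType.
    data = (json or {}).get('data') or {}
    for card in data.get('cards') or []:
        mblog = card.get('mblog')
        if not mblog:  # skip cards without a post body
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
        }
```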

Ajax request

The page= parameter at the end of the Ajax request has been changed to since_id=; is there still a way to scrape it?
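
It should still be possible: as the issue above notes, each response carries the cursor for the next request in data.cardlistInfo.since_id, so the crawler can feed that value back instead of a page number. A minimal sketch, reusing the uid and containerid from the earlier posts (assumptions, not a definitive implementation); the first request is sent without a cursor:

```python
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
}

def crawl(pages=10):
    """Cursor-based paging: the since_id of each response feeds the next request."""
    since_id = None
    for _ in range(pages):
        params = {'type': 'uid', 'value': '2830678474',
                  'containerid': '1076032830678474'}
        if since_id:  # the first request goes out without a cursor
            params['since_id'] = since_id
        resp = requests.get(base_url + urlencode(params), headers=headers)
        data = resp.json().get('data') or {}
        for card in data.get('cards') or []:
            mblog = card.get('mblog')
            if mblog:  # skip cards without a post body
                print(mblog.get('id'), pq(mblog.get('text')).text())
        since_id = (data.get('cardlistInfo') or {}).get('since_id')
        if not since_id:  # no cursor means (presumably) no more pages
            break

crawl()
```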
