python3webspider / weibolist Goto Github PK

View Code? Open in Web Editor NEW

65.0 8.0 93.0 3 KB

WeiboList of MaYun

Python 100.00%

weibolist's Introduction

WeiboList

2020/2/10 更新，见 Issue：#9

weibolist's People

Stargazers

Watchers

Forkers

aegeansea kujiran xclu lxstrive 0xff-dev mortyxu junqiangle hxf175336 dophenxd nofuture123 zzgbird wlinjie yewenchao0102 chengccj zhourubin taotaolt729 thh2018 chen8566 wsh888666 fightyang shizeren967 liu139825 snowdj dex7erhan liwenwei donaldxialiu wnight9527 wgwjifeng liangjunhao xzxin devotionfor lllllllai27 xiaolinpeter sunaque keepwork bianzhuo pityk3369 mldxsj guangpigu yyi ccccyril zhudunfeng smilemilk1992 holysll crystalguo0312 danddyzhong jamesshang87 jiangphcn litouwuu liuqing0918 captainw066 hongru303 penghaoliang myguiye ly-lee klose503 zhangdashenqi frankzhiyuan zxwsbg liugh2020 apmfifty lucbuing qcj1206 ray-jason yumouren doublehok xueshanlinghu coffeeg2ek zoeyyj hitid foxgeek36 orchakcha vagrant-h potato49 foryaoyaohoother linshibin thhou lengkeyu chexia159 chenglong-s roceys mrfangyong vonkonyoung shanshanlulu huang-biao liuzhh1366 spole0168 tzphh jackychen0 lhongjum feng1433 lokyii wood-north

weibolist's Issues

我发现json.get('data').get('cardlistInfo')并没有cardlistInfo，只有page

2021年12月9日20:40:04
json.get('data').get('cardlistInfo').get('cardlistInfo')会返回None.
可用代码：
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
# Endpoint of the m.weibo.cn container API; the encoded query string is
# appended directly after the trailing '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Browser-like request headers for m.weibo.cn.
headers = {
    # A Host header carries a bare host name, so it must not end in '/'.
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_one_page(page):
    """Fetch one page of the weibo list and return the decoded JSON.

    Args:
        page: Page number to request. ``0`` leaves the ``page`` parameter
            out of the query string.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    params = {
        'type': 'uid',
        # Author's note: both numbers can be found in the profile URL, e.g.
        # https://m.weibo.cn/u/7198559139?uid=7198559139&luicode=10000011&lfid=100103type%3D1%26q%3D%E4%B8%83%E6%B5%B7
        # and the containerid is in the URL of the largest XHR request
        # listed in the browser's network panel.
        'value': '7198559139',
        'containerid': '1076037198559139',
    }
    if page != 0:
        params['page'] = page
    url = base_url + urlencode(params)
    print(url)
    try:
        # NOTE(review): the module-level `headers` dict is not sent here.
        # The timeout keeps a stalled connection from blocking the crawl
        # forever; requests waits indefinitely when none is given.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # RequestException also covers timeouts, not just refused connections.
        print(e.args)
        return None
    if response.status_code == 200:
        return response.json()
    return None

def parse_page(json):
    """Yield one dict per weibo post contained in a getIndex response.

    Args:
        json: Decoded response as returned by ``get_one_page``; may be
            ``None`` or lack the ``data``/``cards`` entries.

    Yields:
        Dicts with the keys ``id``, ``text`` (text content extracted with
        PyQuery), ``attitudes``, ``comments`` and ``reposts``.
    """
    # Debug output; reads the module-level since_id assigned by the caller.
    print(since_id)
    if not json:
        return
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        mblog = item.get('mblog')
        if not mblog:
            # Some cards carry no 'mblog' entry; calling .get() on the
            # resulting None is what raised AttributeError before.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }

# Author's note: cardlistInfo itself is not needed, only its `page` value.
# Crawl pages 1-99 and write one parsed post per line to nanami.txt.
with open('nanami.txt', 'w', encoding='utf-8') as f:
    for page in range(1, 100):
        json = get_one_page(page)
        if not json:
            # get_one_page returns None for a failed or non-200 request;
            # skip that page instead of crashing on None.get(...).
            continue
        card_info = (json.get('data') or {}).get('cardlistInfo') or {}
        # parse_page prints this module-level value as debug output.
        since_id = card_info.get('page')
        for result in parse_page(json):
            f.write(str(result))
            f.write('\n')

if page == 1 and index == 1: continue 作何用处

        if page == 1 and index == 1:
            continue

在方法parse_page作何用处，是否可以删除

运行出错

DeprecationWarning: insert is deprecated. Use insert_one or insert_many instead.
if collection.insert(result):
Traceback (most recent call last):
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 59, in
for result in results:
File "E:/DongXuXiang/PycharmProjects/test/ajex.py", line 42, in parse_page
weibo['id'] = item.get('id')
AttributeError: 'NoneType' object has no attribute 'get'
运行的时候出错，大佬什么原因啊，小白，讲解的详细点，谢谢

运行出错

Traceback (most recent call last):
File "D:/test-ajax/mayunweibo.py", line 53, in
for result in results:
File "D:/test-ajax/mayunweibo.py", line 34, in parse_page
for item in items:
TypeError: 'NoneType' object is not iterable

找了半天，也不知道如何解决

ajax请求

ajax请求命令最后的page=改成sinceid=了，还有办法抓么？

小白，改了一个晚上总算能正常运行了

`import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool
# Endpoint of the m.weibo.cn container API; the encoded query string is
# appended directly after the trailing '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Browser-like headers sent with every request made by get_page().
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    # The stray comma that used to end this string was a typo.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page):
    """Fetch one page of the weibo list and return the decoded JSON.

    Args:
        page: Page number placed in the ``page`` query parameter.

    Returns:
        The decoded JSON body, or ``None`` when the request itself fails.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        # The timeout keeps a stalled connection from blocking the crawl
        # forever; requests waits indefinitely when none is given.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException also covers timeouts, not just refused connections.
        print('error', e.args)
        return None
    return resp.json()
from pyquery import PyQuery as pq

def parse_page(json):
    """Yield one dict per weibo post contained in a getIndex response.

    Args:
        json: Decoded response as returned by ``get_page``; may be ``None``
            or lack the ``data``/``cards`` entries.

    Yields:
        Dicts with the keys ``id``, ``text`` (text content extracted with
        PyQuery), ``attitudes``, ``comments`` and ``reposts``.
    """
    if not json:
        return
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        mblog = item.get('mblog')
        if not mblog:
            # This check is the key fix: some returned cards do not contain
            # an 'mblog' entry at all.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            # The field is 'reposts_count'; the former 'repost_count'
            # lookup always produced None.
            'reposts': mblog.get('reposts_count'),
        }

        # if json:
        #     items = json.get('data').get('cards')
        #     for item in items:
        #         print(type(item))
        #         item = item.get('mblog')
        #         weibo = {}
        #         print(type(item))
        #         weibo['id'] = item.get('id')
        #         weibo['text'] = pq(item.get('text')).text()
        #         weibo['attitudes'] = item.get('attitudes_count')
        #         weibo['comments'] = item.get('comments_count')
        #         weibo['reposts'] = item.get('reposts_count')
        #         yield weibo

# The dunder underscores were lost when the snippet was pasted as markdown;
# the bare `name == 'main'` comparison raises NameError.
if __name__ == '__main__':
    # Crawl pages 1-10 and print every parsed post.
    for page in range(1, 11):
        json = get_page(page)
        for result in parse_page(json):
            print(result)

`
代码如上，因为返回的数据中，有的条目没有数据

page参数已改成since_id，有些ajax技术上的问题

如题，page参数已改成since_id，已经发现下一个ajax请求的since_id在上一个since_id中。故解析json的时候，不仅要返回列表，还要返回since_id。又由于本人对yield函数不太理解，故直接再定义了一个新函数来专门负责返回since_id。（代码附后）

这看起来应当是可行的，看网页返回的ajax文件就能发现since_id = json.get('data').get('cardlistInfo').get('since_id')。但是，当我进行调试时，却发现python请求得到的json中，恰恰缺少了since_id一项，这是怎么回事呢？

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

# Endpoint of the m.weibo.cn container API; get_url() appends the encoded
# query string directly after the trailing '?'.
base_url = "https://m.weibo.cn/api/container/getIndex?"

# Browser-like headers sent with every request made by get_page().
headers = {
    "Host": "m.weibo.cn",
    "Referer": "https://m.weibo.cn/u/2492465520",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

def get_url(since_id):
    """Build the getIndex URL for the page identified by ``since_id``.

    Args:
        since_id: Value placed in the ``since_id`` query parameter.

    Returns:
        ``base_url`` followed by the URL-encoded query string.
    """
    query = urlencode({
        'type': 'uid',
        'value': '2492465520',
        'containerid': '1076032492465520',
        'since_id': since_id,
    })
    return base_url + query

def get_page(url):
    """Request ``url`` and return the decoded JSON body.

    Args:
        url: Fully built request URL, e.g. from ``get_url``.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    try:
        # The timeout keeps a stalled connection from blocking the crawl
        # forever; requests waits indefinitely when none is given.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException also covers timeouts, not just refused connections.
        print('Error', e.args)
        return None
    if response.status_code == 200:
        return response.json()
    return None

def parse_page(json):
    """Yield one dict per weibo post contained in a getIndex response.

    Args:
        json: Decoded response as returned by ``get_page``; may be ``None``
            or lack the ``data``/``cards`` entries.

    Yields:
        Dicts with the keys ``id`` and ``text`` (text content extracted
        with PyQuery).
    """
    if not json:
        return
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        mblog = item.get('mblog')
        if not mblog:
            # Cards without an 'mblog' entry hold no post; calling .get()
            # on the resulting None raised AttributeError before.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
        }

def parse_engine(json, i):
    """Follow the ``since_id`` chain for at most ``i`` further requests.

    Each response's ``data.cardlistInfo.since_id`` is printed and used to
    request the next page. The walk stops early when a response is missing
    (``get_page`` returned ``None``) or carries no ``since_id``.

    Args:
        json: Decoded response of the starting page; may be ``None``.
        i: Maximum number of follow-up pages to request.

    Returns:
        None.
    """
    # A loop instead of recursion: same page order, no stack growth per page.
    while i > 0 and json:
        card_info = (json.get('data') or {}).get('cardlistInfo') or {}
        since_id = card_info.get('since_id')
        print(since_id)
        if since_id is None:
            # Without a cursor there is no next page to ask for; requesting
            # 'since_id=None' would only produce a bogus URL.
            break
        json = get_page(get_url(since_id))
        i -= 1

if __name__ == '__main__':
    # Fetch the page for a fixed starting since_id, then follow the chain
    # for up to ten further pages.
    json = get_page(get_url('4293511823942483'))
    parse_engine(json, 10)

python3webspider / weibolist Goto Github PK

weibolist's Introduction

WeiboList

weibolist's People

Stargazers

Watchers

Forkers

weibolist's Issues

我发现json.get('data').get('cardlistInfo')并没有cardlistInfo，只有page

if page == 1 and index == 1: continue 作何用处

运行出错

运行出错

ajax请求

小白，改了一个晚上总算能正常运行了

page参数已改成since_id，有些ajax技术上的问题

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent