QQ群信息爬取——使用模拟登录

Source:网络 Release time:2019年09月10日 23点43分10秒 Author:遇见 Reading volume:475

本文链接:https://blog.csdn.net/a_hui_tai_lang/article/details/82800286

需要安装谷歌浏览器,并下载与浏览器版本匹配的chromedriver.exe,放到python的安装路径下(或其他已加入PATH的目录)

#coding=utf-8
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

class qqGroupSpider():
    '''
    Crawler for the member list of a QQ group.

    Creating an instance logs in through the web login form and opens the
    group's member page. The other methods scroll that page, collect the
    member <tbody> elements and write one comma-separated line per member
    to the supplied file object.
    '''
    def __init__(self, driver,qq,passwd,qqgroup,writefile):
        '''
        Log in with the given account and open the group's member page.

        :param driver: Selenium WebDriver used for all page interaction.
        :param qq: account number typed into the login form.
        :param passwd: password typed into the login form.
        :param qqgroup: group number; appended to the member-page URL.
        :param writefile: open file-like object that parseAndWrite() writes
            UTF-8 encoded byte strings to.
        '''
        url = "https://qun.qq.com/member.html#gid=" + str(qqgroup)
        self.writefile = writefile
        self.driver = driver
        driver.delete_all_cookies()
        driver.get(url)
        time.sleep(1)
        # The login form lives inside its own iframe.
        driver.switch_to.frame("login_frame")
        time.sleep(1)
        # NOTE(review): presumably switches the form to account/password
        # login - confirm against the live page.
        driver.find_element(By.ID, "switcher_plogin").click()
        # The generic find_element(By, value) form is used because the
        # find_element_by_* shortcuts no longer exist in Selenium 4.
        account_box = driver.find_element(By.ID, 'u')
        account_box.clear()
        account_box.send_keys(qq)
        password_box = driver.find_element(By.ID, 'p')
        password_box.clear()
        password_box.send_keys(passwd)
        driver.find_element(By.CLASS_NAME, "login_button").click()
        time.sleep(1)

    def scroll_foot(self,driver):
        '''
        Scroll the page down by setting a very large scrollTop.

        :param driver: Selenium WebDriver showing the member page.
        :return: whatever execute_script() returns for the statement.
        '''
        js = "var q=document.documentElement.scrollTop=100000"
        return driver.execute_script(js)

    def getTbodyList(self, driver):
        '''
        Return every member <tbody> currently present on the page.

        :param driver: Selenium WebDriver showing the member page.
        :return: list of WebElements, one per matching <tbody>.
        '''
        # NOTE(review): "group-memeber" is kept exactly as in the original
        # selector; presumably it matches the page's own class name, so
        # verify against the page before "correcting" the spelling.
        return driver.find_elements(
            By.XPATH,
            '//div[@class="group-memeber"]//tbody[contains(@class,"list")]')

    @staticmethod
    def _cellText(node):
        '''Return the stripped text of an element, or u"" if it has none.'''
        # An element without text content has .text == None.
        return (node.text or u'').strip()

    def parseMember(self, mb):
        '''
        Turn one member row into a comma-separated record.

        The fields are, in order: admin flag ('1' when the first child of
        the third cell carries a class attribute, else '0'), QQ number,
        nickname, group card, sex, QQ age, join time, last-activity time.
        Cells without text produce an empty field.

        :param mb: <tr> element of one member (children read by position).
        :return: the record as a UTF-8 encoded byte string.
        '''
        cells = list(mb)
        text = self._cellText
        master = '0' if cells[2][0].get('class') is None else '1'
        nickName = text(cells[2][2])
        card = text(cells[3][0])
        qq = text(cells[4])
        sex = text(cells[5])
        qqAge = text(cells[6])
        joinTime = text(cells[7])
        lastTime = text(cells[8])
        fields = [master, qq, nickName, card, sex, qqAge, joinTime, lastTime]
        return u",".join(fields).encode('utf-8')

    def parseTbody(self, html):
        '''
        Parse the inner HTML of one <tbody>, which may hold many members.

        :param html: HTML markup containing the member rows.
        :return: list with one parseMember() record per matching row.
        '''
        selector = etree.HTML(html)
        rows = selector.xpath('//tr[contains(@class,"mb mb")]')
        return [self.parseMember(row) for row in rows]

    def parseAndWrite(self, tbody):
        '''
        Parse one <tbody> WebElement and write its members to the file.

        :param tbody: WebElement whose innerHTML holds the member rows.
        :return: None
        '''
        html = tbody.get_attribute('innerHTML')
        # An explicit loop instead of map(): the write is a side effect and
        # must not depend on map() being evaluated eagerly.
        for line in self.parseTbody(html):
            self.writefile.write(line + b'\n')



def main():
    '''
    Prompt for account, password, group number and output name, then crawl
    the group's member list into qq/<name>.txt.

    The member page loads rows lazily, so the page is scrolled repeatedly;
    after each scroll only the <tbody> elements that were not seen before
    are parsed and written. The browser is quit and the file closed even if
    the crawl raises.
    '''
    import os  # local import: only needed here to create the output directory

    qq = str(raw_input("请输入你的QQ:"))
    passwd = str(raw_input("请输入你的QQ密码:"))
    qqgroup = raw_input("请输入QQ群号:")
    filename = str(raw_input("请输入保存的文件名:"))

    # Results go under ./qq; create it on first use instead of failing in open().
    if not os.path.isdir('qq'):
        os.makedirs('qq')
    out_path = unicode('qq/' + filename + '.txt', 'utf-8')

    # Stop after this many consecutive scrolls that load no new member row,
    # so a stalled page cannot keep the loop running forever.
    max_stalled_rounds = 10

    with open(out_path, 'w') as out_file:
        driver = webdriver.Chrome()
        try:
            spider = qqGroupSpider(driver, qq, passwd, qqgroup, out_file)
            # Member count shown on the page; used as the stop condition.
            qqNum = int(driver.find_element(
                By.XPATH, '//*[@id="groupMemberNum"]').text.strip())
            curren_qq_num = 0
            count = 0
            prelen = 0
            stalled_rounds = 0
            # "<" rather than "!=": also terminates if more rows than the
            # announced count ever show up.
            while curren_qq_num < qqNum:
                count += 1
                print(count)
                spider.scroll_foot(driver)
                time.sleep(1)
                loaded = len(driver.find_elements(
                    By.XPATH,
                    '//*[@id="groupMember"]//td[contains(@class,"td-no")]'))
                tlist = spider.getTbodyList(driver)
                # Only the tbody elements added since the previous round.
                for tbody in tlist[prelen:]:
                    spider.parseAndWrite(tbody)
                prelen = len(tlist)
                if loaded > curren_qq_num:
                    stalled_rounds = 0
                else:
                    stalled_rounds += 1
                    if stalled_rounds >= max_stalled_rounds:
                        print("Stopped: only %d of %d members loaded"
                              % (loaded, qqNum))
                        break
                curren_qq_num = loaded
        finally:
            driver.quit()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

运行如下:



爬取出的数据保存成文本文件,每行一个成员,以逗号分隔,格式:是否群管理(1/0),QQ号,昵称,群名片,性别,Q龄,入群时间,最后发言

代码放在GitHub上,python-learning
以上具体代码在目录other/qq_group.py

重要通知

通过本站建设的所有产品均禁止放置违法违规类内容!!! 违法违规类:仿站、博彩类、...

道者虽羽化,凡间依虔诚。

他是想求个签的,可能大师在打坐修行,始终没有理会我们。 临走时他把他从辽宁营口...

在最深的红尘里重逢——白落梅

所以让这个充满情愫的男子,往后的时光在矛盾与痛苦中感叹:世间安得双全法,不负如来...

李宗盛最经典的10首歌:最怕在某个...

如果你听懂了李宗盛,也许那就是你的人生。 如果没有,那也许就是你一生的幸运...


遇见ni

   I want to leave a message
陇ICP备19002756号