
Python code snippet collection

Getting started with Python web scraping in a few lines of code

import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
import time
import pymysql
from sqlalchemy import create_engine
from urllib.parse import urlencode  # for encoding URL query strings; https://www.makcyun.top/web_scraping_withpython18.html
from requests.exceptions import RequestException  # needed by the except clause below

start_time = time.time()  # measure program running time

def get_one_page(i):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        paras = {
            'reportTime': '2017-12-31',
            # the report date can be changed; e.g. 2018-6-30 fetches that quarter's data
            'pageNum': i  # page number
        }
        url = 'http://s.askci.com/stock/a/?' + urlencode(paras)
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('request failed')

def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    content = soup.select('#myTable04')[0]  # [0] turns the returned list into a bs4 element
    tbl = pd.read_html(content.prettify(), header=0)[0]
    # prettify() normalizes the markup; [0] extracts the DataFrame from the list pd.read_html returns
    tbl.rename(columns={'序号': 'serial_number', '股票代码': 'stock_code', '股票简称': 'stock_abbre', '公司名称': 'company_name', '省份': 'province', '城市': 'city', '主营业务收入(201712)': 'main_bussiness_income', '净利润(201712)': 'net_profit', '员工人数': 'employees', '上市日期': 'listing_date', '招股书': 'zhaogushu', '公司财报': 'financial_report', '行业分类': 'industry_classification', '产品类型': 'industry_type', '主营业务': 'main_business'}, inplace=True)
    return tbl

def generate_mysql():
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='******',
        port=3306,
        charset='utf8',
        db='wade')
    cursor = conn.cursor()

    sql = 'CREATE TABLE IF NOT EXISTS listed_company (serial_number INT(20) NOT NULL,stock_code INT(20) ,stock_abbre VARCHAR(20) ,company_name VARCHAR(20) ,province VARCHAR(20) ,city VARCHAR(20) ,main_bussiness_income VARCHAR(20) ,net_profit VARCHAR(20) ,employees INT(20) ,listing_date DATETIME(0) ,zhaogushu VARCHAR(20) ,financial_report VARCHAR(20) , industry_classification VARCHAR(20) ,industry_type VARCHAR(100) ,main_business VARCHAR(200) ,PRIMARY KEY (serial_number))'
    cursor.execute(sql)
    conn.close()

def write_to_sql(tbl, db='wade'):
    engine = create_engine('mysql+pymysql://root:******@localhost:3306/{0}?charset=utf8'.format(db))
    try:
        tbl.to_sql('listed_company2', con=engine, if_exists='append', index=False)
        # 'append' adds rows to the existing table, which must already have a header
    except Exception as e:
        print(e)

def main(page):
    generate_mysql()
    for i in range(1, page):
        html = get_one_page(i)
        tbl = parse_one_page(html)
        write_to_sql(tbl)

# single process
if __name__ == '__main__':
    main(178)
    endtime = time.time() - start_time
    print('finished in %.2f seconds' % endtime)

# multiprocessing
from multiprocessing import Pool
if __name__ == '__main__':
    pool = Pool(4)
    pool.map(main, [i for i in range(1, 178)])  # 178 pages in total
    print('finished in %.2f seconds' % (time.time() - start_time))

import pandas as pd
import csv
for i in range(1, 178):  # scrape all pages
    tb = pd.read_html('http://s.askci.com/stock/a/?reportTime=2017-12-31&pageNum=%s' % (str(i)))[3]
    tb.to_csv(r'1.csv', mode='a', encoding='utf_8_sig', header=1, index=0)

Time handling

# -*- coding: utf-8 -*-
# author: inpurer(月小水长)
# pc_type lenovo
# create_date: 2018/12/3
# file_name: timetest.py
# description: 月小水长,热血未凉

import time

t0 = time.time()
# description: the number of seconds since 1970-01-01, precise to six decimal places -- the timestamp
# output sample: 1543799532.602318
print(t0)

t1 = time.localtime(t0)
# description: converts the timestamp into a struct of nine fields; the first six are self-explanatory,
# tm_wday is the day of the week (indexed from 0), tm_yday the day of the year (indexed from 1),
# and tm_isdst flags daylight-saving time, which you rarely need to care about
# output sample: time.struct_time(tm_year=2018, tm_mon=12, tm_mday=3, tm_hour=9, tm_min=22, tm_sec=24, tm_wday=0, tm_yday=337, tm_isdst=0)
print(t1)
# so today's day-of-year can be printed directly (tm_yday is already 1-based)
print(t1.tm_yday)


# formatting the struct

# description: a simple human-readable form
# output sample: Mon Dec 3 09:31:18 2018
t2 = time.asctime(t1)
print(t2)

# description: strftime supports arbitrary formats; below is a common standard form
# output sample: 2018-12-03 09:33:36
t3 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
print(t3)
# %y two-digit year (00-99)
# %Y four-digit year (0000-9999)
# %m month (01-12)
# %d day of the month (01-31)
# %H hour, 24-hour clock (00-23)
# %I hour, 12-hour clock (01-12)
# %M minute (00-59)
# %S second (00-59)
#
# %a abbreviated weekday name
# %A full weekday name
# %b abbreviated month name
# %B full month name
# %c locale's date and time representation
# %j day of the year (001-366)
# %p locale's A.M./P.M. equivalent
# %U week number of the year (00-53), Sunday as the first day of the week
# %w weekday (0-6), Sunday is 0
# %W week number of the year (00-53), Monday as the first day of the week
# %x locale's date representation
# %X locale's time representation
# %Z time zone name
# %% a literal % character


# parsing a formatted string back into a struct
# description: the first argument is the formatted string, the second the matching format, so the parse can be reversed
# output sample: time.struct_time(tm_year=2018, tm_mon=12, tm_mday=3, tm_hour=9, tm_min=47, tm_sec=7, tm_wday=0, tm_yday=337, tm_isdst=-1)
t4 = time.strptime(t3, '%Y-%m-%d %H:%M:%S')
print(t4)

# converting a struct back into a timestamp
# description: the inverse of time.localtime; precision is lost to the formatting round-trip
# output sample: 1543801627.0
t5 = time.mktime(t4)
print(t5)

import datetime

# datetime.datetime.now() returns the current date and time
# as a datetime instance, not a string;
# printing it just calls the __str__ method datetime implements
t0 = datetime.datetime.now()
print(t0)        # print: 2018-12-03 12:55:49.905971
print(type(t0))  # print: <class 'datetime.datetime'>

# the individual fields are then available as int attributes
print(t0.year)        # print: 2018
print(type(t0.year))  # print: <class 'int'>
print(t0.month)
print(t0.day)
print(t0.hour)
print(t0.minute)
print(t0.second)

import datetime
t0 = datetime.datetime.now()
# note that year/month/day are ints; unlike Java, strings and numbers cannot be concatenated directly
wanted_time = str(t0.year) + "-" + str(t0.month) + "-" + str(t0.day)
# https://inspurer.github.io/2018/12/03/%E4%B8%80%E6%96%87%E6%90%9E%E5%AE%9Apython%E7%9A%84%E6%97%B6%E9%97%B4%E5%A4%84%E7%90%86/
import time
t0 = time.localtime()
wanted_time = time.strftime("%Y-%m-%d", t0)

error: Microsoft Visual C++ 14.0 is required

Installing from source without a C++ build environment fails with:


error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual
C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
On Python 2.7, install Microsoft Visual C++ Compiler for Python 2.7: https://www.microsoft.com/en-us/download/details.aspx?id=44266
On Python 3, install Visual C++ 2015 Build Tools: http://landinghub.visualstudio.com/visual-cpp-build-tools
Alternatively, download a prebuilt exe,
or use a whl package instead.

A comma-induced tragedy

>>> a = [
... 'foo'
... 'bar',
... 'tree'
... ]
>>>
>>> b = 'foo' 'bar'
>>>
>>> print a
['foobar', 'tree']
>>> print b
foobar
>>>
In other words, 'foo' 'bar' is equivalent to 'foo' + 'bar': adjacent string literals are concatenated, so the missing comma silently merged two list elements.
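One legitimate use of this implicit concatenation (a small illustrative sketch, not from the original post) is splitting a long literal across lines without +:

query = ('SELECT total_bill, tip '
         'FROM tips '
         'LIMIT 5')  # one string: 'SELECT total_bill, tip FROM tips LIMIT 5'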

Now another example; note the comma at the end of the second line:

>>> a = {'foo': 'bar'}
>>> b = a.get('foo'),
>>> c = a.get('foo')
>>> print(b)
('bar',)
>>> print(c)
bar
>>>
b was supposed to be a string, but the trailing comma silently turned it into a tuple. https://www.restran.net/2015/11/07/python-comma-issue/

Fetching Bing's daily image as wallpaper with Python

http://jeffyang.top/Python/python%E8%8E%B7%E5%8F%96Bing%E5%9B%BE%E7%89%87%E5%81%9A%E5%A3%81%E7%BA%B8/
# imports the snippet relies on (missing in the original extract)
import os
import requests
import win32api, win32con, win32gui
from urllib.request import urlretrieve
from PIL import Image

def get_url(day=0):
    url = "https://www.bing.com/HPImageArchive.aspx?format=js&idx=" + str(day) + "&n=1&nc=1509675905008&pid=hp&video=1"
    return url

def get_img(url, path="D://wallpaper/"):
    isExists = os.path.exists(path)  # https://github.com/JianFengY/BingSpider
    if not isExists:
        os.makedirs(path)
    html = requests.get(url)
    content = html.json()
    src = "https://www.bing.com" + content['images'][0]['url']
    urlretrieve(src, path + content['images'][0]['enddate'] + '.jpg')

def set_wallpaper_from_bmp(bmp_path):
    reg_key = win32api.RegOpenKeyEx(win32con.HKEY_CURRENT_USER, "Control Panel\\Desktop", 0, win32con.KEY_SET_VALUE)
    win32api.RegSetValueEx(reg_key, "WallpaperStyle", 0, win32con.REG_SZ, "2")
    win32api.RegSetValueEx(reg_key, "TileWallpaper", 0, win32con.REG_SZ, "0")
    win32gui.SystemParametersInfo(win32con.SPI_SETDESKWALLPAPER, bmp_path, win32con.SPIF_SENDWININICHANGE)

def set_wallpaper(img_path):
    isExists = os.path.exists(img_path)
    if isExists:
        img_dir = os.path.dirname(img_path)
        bmpImage = Image.open(img_path)
        new_bmp_path = os.path.join(img_dir, 'wallpaper.bmp')
        bmpImage.save(new_bmp_path, "BMP")
        set_wallpaper_from_bmp(new_bmp_path)
        return True
    else:
        return False

The automated testing tool selenium: from selenium import webdriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException  # used in the last example

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    input = browser.find_element_by_id('kw')
    input.send_keys('Python')
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')

browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.zhihu.com/explore')
input = browser.find_element_by_class_name('zu-top-add-question')
print(input)

try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('Time Out')
try:
    browser.find_element_by_id('hello')
except NoSuchElementException:
    print('No Element')
finally:
    browser.close()

http://jeffyang.top/Python/%E7%88%AC%E8%99%AB/Python%E7%88%AC%E8%99%AB%E5%B8%B8%E7%94%A8%E5%BA%93selenium%E8%AF%A6%E8%A7%A3/

Intersection, union, and difference

a = ['a', 'b', 'c']
b = ['b', 'c', 'd']

# intersection:

print list(set(a).intersection(set(b)))

# or
isec = [val for val in a if val in b]
print isec

# union
print list(set(a).union(set(b)))
# difference
print list(set(b).difference(set(a)))  # b - a
# flattening a nested list
a = [[1,2],[3,4],[1,4]]
b = [x for j in a for x in j]
print b
# [1, 2, 3, 4, 1, 4]

Splitting a list into fixed-length sublists

def split_list(splist, s):
    """split a list into sublists of length s"""
    return [splist[i:i + s] for i in range(0, len(splist), s)]

# test
list1 = list(range(10))
split_list(list1, 2)
# [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]

Generating the English alphabet

# lowercase letters
list(map(chr, list(range(97, 123))))

# uppercase letters
list(map(chr, list(range(65, 91))))

# lowercase letters (the original swapped these two labels)
import string
string.ascii_lowercase

# uppercase letters
import string
string.ascii_uppercase

Sorting a dictionary

Sort by key:
dic = {'a': 11, 'b': 5, 'c': 7}

# ascending
sorted(dic.keys())

# descending
sorted(dic.keys(), reverse=True)

Sort by value:
dic = {'a': 11, 'b': 5, 'c': 7}

# ascending
sorted(dic.items(), key=lambda x: x[1])

# descending
sorted(dic.items(), key=lambda x: x[1], reverse=True)
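Note that sorted() on items() returns a list of (key, value) tuples; if a dictionary is wanted back, a dict() wrapper suffices (dicts preserve insertion order from Python 3.7 on):

dict(sorted(dic.items(), key=lambda x: x[1]))
# {'b': 5, 'c': 7, 'a': 11}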

Automatically exporting WeChat official-account articles or web pages as stitched screenshots

https://juejin.im/post/5b4cc601f265da0f5a2545a9
https://github.com/MartinHan01/webpage2img
pip install pillow
# init_filelist(), save_url() and package_picture() come from the webpage2img repo linked above
init_filelist()
# first initialize the webdriver
driver = webdriver.Chrome()
# set the output directory
dir = './result'

for item in filelist:
    try:
        # get the screenshot path and title, plus the output directory
        # scroll automatically, saving a screenshot of each step
        pic_path, title = save_url(driver, item, dir)
        # stitch together all the screenshots just taken
        package_picture(pic_path, os.path.abspath(dir), title)
    except Exception as e:
        print(e)
python crop.py

Batch-compressing images

pip install --upgrade tinify

import tinify
import os

tinify.key = 'your API key here'
path = "xxx"  # directory holding the images

for dirpath, dirs, files in os.walk(path):
    for file in files:
        imgpath = os.path.join(dirpath, file)
        print("compressing ..." + imgpath)
        tinify.from_file(imgpath).to_file(imgpath)

Retrying

import random
from tenacity import retry

@retry
def do_something_unreliable():
    if random.randint(0, 10) > 1:
        raise IOError("Broken sauce, everything is hosed!!!111one")
    else:
        return "Awesome sauce!"

print(do_something_unreliable())

from tenacity import *

@retry(stop=(stop_after_delay(10) | stop_after_attempt(5)), wait=wait_fixed(2))
def stop_after_10_s_or_5_retries():
    print("Stopping after 10 seconds or 5 retries")
    raise Exception

# stop after 10 seconds or 5 attempts, waiting 2 seconds between tries
def func():
    pass

for _ in range(0, 100):
    while True:
        try:
            func()
        except SomeSpecificException:  # placeholder exception type from the source
            continue
        break

def verify_url(url):
    import requests
    try:
        requests.get(url, timeout=10)
        return True
    except requests.exceptions.ConnectTimeout:
        return False

def main():
    for _ in range(5):
        try:
            if verify_url(''):
                return
            else:
                continue
        except KeyError:
            continue

if __name__ == '__main__':
    main()
https://zhangslob.github.io/2019/01/14/Python%E9%87%8D%E8%AF%95%E7%9A%84%E5%A4%9A%E9%87%8D%E6%96%B9%E6%B3%95/

Logging in to GitHub

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'github.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
s = requests.session()
s.headers.update(headers)

def get_token():
    url = 'https://github.com/login'
    response = s.get(url)
    pat = 'name=\"authenticity_token\" value=\"(.*?)\"'
    authenticity_token = re.findall(pat, response.text)[0]
    return authenticity_token

def login(authenticity_token, account, password):
    payload = {
        'commit': 'Sign in',
        'utf8': '\u2713',
        'authenticity_token': authenticity_token,
        'login': account,
        'password': password,
    }
    url = 'https://github.com/session'
    response = s.post(url, data=payload)
    print(response)
    # do whatever you want

if __name__ == '__main__':
    account, password = 'account', 'password'
    authenticity_token = get_token()
    login(authenticity_token, account, password)

Multithreading and multiprocessing

import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))

import concurrent.futures
import math

PRIMES = [
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419]

def is_prime(n):
    if n % 2 == 0:
        return False
    sqrt_n = int(math.floor(math.sqrt(n)))
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True

def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for number, prime in zip(PRIMES, executor.map(is_prime, PRIMES)):
            print('%d is prime: %s' % (number, prime))

if __name__ == '__main__':
    main()


https://zhangslob.github.io/2018/07/03/%E5%BF%AB%E9%80%9F%E5%86%99%E4%B8%80%E4%B8%AA%E7%88%AC%E8%99%AB/

Data analysis with Pandas

import pandas as pd
import numpy as np

url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')
tips = pd.read_csv(url)
output = tips.head()
>>> output
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

The equivalent SQL statement: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;

output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)

https://learnku.com/articles/29825

Multithreading

A thread is the smallest unit of execution the operating system can schedule. It lives inside a process and is the process's actual unit of work; one process can run several threads concurrently, each executing a different task, and all threads of a process share all of its system resources.
The demo below has multiple threads modify one variable. With a small loop count the final value matches expectations; raise the loop count and it no longer does. The lesson: threads compete for shared resources, so modifying the same resource requires a mutex, while taking care to avoid deadlock.

# coding=utf-8
import threading

# a shared variable that multiple threads will increment
balance = 0

def worker1():
    global balance
    for i in range(1000):
        balance += 1
    print('thread 1 finished, balance=' + str(balance))

def worker2():
    global balance
    for i in range(1000):
        balance += 1
    print('thread 2 finished, balance=' + str(balance))

def main():
    # construct the thread objects
    t1 = threading.Thread(target=worker1)
    t2 = threading.Thread(target=worker2)
    # start them
    t1.start()
    t2.start()

"""
With 1000 iterations the program prints:
thread 1 finished, balance=1000
thread 2 finished, balance=2000
With 1000000 iterations it prints:
thread 1 finished, balance=1180919
thread 2 finished, balance=1179703
"""

if __name__ == '__main__':
    main()
To fix the problem above, use the thread lock object; only worker1 and worker2 need to change.

# create a mutex; it starts out unlocked
mutex = threading.Lock()

def worker1():
    global balance
    for i in range(1000000):
        mutex.acquire()
        balance += 1
        mutex.release()
    print('thread 1 finished, balance=' + str(balance))

def worker2():
    global balance
    for i in range(1000000):
        mutex.acquire()
        balance += 1
        mutex.release()
    print('thread 2 finished, balance=' + str(balance))

"""
Output with the mutex:
thread 1 finished, balance=1941343
thread 2 finished, balance=2000000
"""
Characteristics:

Threads execute in no fixed order
The main thread (process) waits for all child threads to finish before exiting; when the main thread (process) ends, its child threads necessarily end
Threads share the process's resources
Modifying shared resources requires locking where necessary, while avoiding deadlock
Threads use fewer resources than processes
More threads is not automatically faster
Because of the GIL, Python multithreading is not true parallelism, only interleaved execution
https://learnku.com/articles/29367
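The waiting noted above applies to non-daemon threads; calling join() makes it explicit. A minimal sketch (not from the original article) that makes the locked version print a deterministic final balance:

import threading

balance = 0
mutex = threading.Lock()

def worker():
    global balance
    for _ in range(1000000):
        with mutex:  # equivalent to acquire()/release(), but exception-safe
            balance += 1

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # block until both workers have finished
print(balance)  # always 2000000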

Tail recursion

# example code
def tail_recursion(n, total=0):
    if n == 0:
        return total
    else:
        return tail_recursion(n-1, total+n)

# call expansion of tail_recursion(5):
tail_recursion(5)
tail_recursion(4, 5)
tail_recursion(3, 9)
tail_recursion(2, 12)
tail_recursion(1, 14)
tail_recursion(0, 15)
5+4+3+2+1=15
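Worth noting (an addition, not from the original): CPython performs no tail-call optimization, so the version above still consumes one stack frame per call and hits the recursion limit for large n; an explicit loop is the safe equivalent:

def tail_recursion_iter(n, total=0):
    # same accumulator logic, rewritten as a loop
    while n > 0:
        total, n = total + n, n - 1
    return total

print(tail_recursion_iter(5))  # 15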

Fetching all articles of a WeChat official account

https://mp.weixin.qq.com/s/nkW2sYLcdsNTYTkk-4BeLA
import requests
import json
import time
from pymongo import MongoClient

# WeChat does not allow the account's homepage link inside an article; xxx stands for profile_ext
url = 'http://mp.weixin.qq.com/mp/xxx'
#url='https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIyMjg2ODExMA==&f=json&offset=21&count=10&is_ok=1&scene=124&uin=NjQ3OTQwMTAy&key=a90c16d3bbfeedd04adeeda7bfc81049f486e81712f95a347e33fccfb9fe00841ec6a4d0984ce32f72fe5e8c479fd13c6680b5496cda322ab1bb2b81417a10ae277a861ad580e77cdf78edbf86212c08&pass_ticket=2vonvdf3N4L67te2BCa4ZqvIs1ed2MoeBqoznvfNSL%2BeKqF4YgHUvNEWLNczZovz&wxtoken=&appmsg_token=1015_jLHC7BDStvidMqo9YO55XLerjoP9z6UM70Q5vw~~&x5=0&f=json'
# Mongo configuration
conn = MongoClient('127.0.0.1', 27017)
db = conn.wx            # connect to the wx database, created automatically if absent
mongo_wx = db.article   # use the article collection, created automatically if absent

def get_wx_article(biz, uin, key, index=0, count=10):
    offset = (index + 1) * count
    params = {
        '__biz': biz,
        'uin': uin,
        'key': key,
        'offset': offset,
        'count': count,
        'action': 'getmsg',
        'f': 'json'
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }

    response = requests.get(url=url, params=params, headers=headers)
    resp_json = response.json()
    if resp_json.get('errmsg') == 'ok':
        resp_json = response.json()
        # whether more pages exist; determines the return value
        can_msg_continue = resp_json['can_msg_continue']
        # number of articles on this page
        msg_count = resp_json['msg_count']
        general_msg_list = json.loads(resp_json['general_msg_list'])
        list = general_msg_list.get('list')
        print(list, "**************")
        for i in list:
            app_msg_ext_info = i['app_msg_ext_info']
            # title
            title = app_msg_ext_info['title']
            # article URL
            content_url = app_msg_ext_info['content_url']
            # cover image
            cover = app_msg_ext_info['cover']

            # publication time
            datetime = i['comm_msg_info']['datetime']
            datetime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(datetime))

            mongo_wx.insert({
                'title': title,
                'content_url': content_url,
                'cover': cover,
                'datetime': datetime
            })
        if can_msg_continue == 1:
            return True
        return False
    else:
        print('error while fetching articles...')
        return False


if __name__ == '__main__':
    biz = 'Mzg4MTA2Nzg0NA=='
    uin = 'NDIyMTI5NDM1'
    key = '20a680e825f03f1e7f38f326772e54e7dc0fd02ffba17e92730ba3f0a0329c5ed310b0bd55b3c0b1f122e5896c6261df2eaea4036ab5a5d32dbdbcb0a638f5f3605cf1821decf486bb6eb4d92d36c620'
    index = 0
    while 1:
        print(f"fetching page {index + 1} of the account's articles.")
        flag = get_wx_article(biz, uin, key, index=index)
        # sleep 8 seconds to avoid being blocked
        time.sleep(8)
        index += 1
        if not flag:
            print('all articles fetched; exiting.')
            break

        print(f'..........about to fetch page {index + 1}.')

PHP vs. Python code comparison

# -*- coding: utf-8 -*-
# https://learnku.com/articles/30958
n = 0
while n < 3:
    # count attempts, used by the loop condition
    n = n + 1
    # hard-coded account and password
    uname = 'tangqingsong'
    pwd = '123123'
    # read input
    username = input('Enter username: ')
    password = input('Enter password: ')

    # if the credentials match, report success and leave the loop
    if uname == username and pwd == password:
        print('Congratulations, login successful~')
        break
    # after the third failure, report the count and announce the exit
    elif n == 3:
        print('Failed', n, 'times, exiting...')
    # within the first three tries, report how many remain
    else:
        print('Sorry, wrong username or password; you have', 3 - n, 'tries left')

$n = 0;

while ($n < 3) {
    # count attempts, used by the loop condition
    $n = $n + 1;
    # hard-coded account and password
    $uname = 'tangqingsong';
    $pwd = '123123';
    # read input
    fwrite(STDOUT, 'Enter username: ');
    $username = trim(fgets(STDIN));
    fwrite(STDOUT, 'Enter password: ');
    $password = trim(fgets(STDIN));

    # if the credentials match, report success and leave the loop
    if ($uname == $username and $pwd == $password) {
        print_r('Congratulations, login successful~');
        break;
    # after the third failure, report the count and announce the exit
    } elseif ($n == 3) {
        print_r("Failed {$n} times, exiting...");
    } else {
        # within the first three tries, report how many remain
        $j = 3 - $n;
        print_r("Sorry, wrong username or password; you have {$j} tries left");
    }
}

Encoding in Python 2 vs. Python 3

Python 2 has two types for character sequences, str and unicode. A str instance holds raw 8-bit values; a unicode instance holds Unicode characters.

A str is essentially "text in some encoding"; in most cases a quoted literal is a str, and its encoding is whatever your source file uses (on Windows the default is GBK). A unicode string means "text encoded as Unicode". Python formally introduced Unicode strings in version 2.0 to cope with text in many different encodings; from then on the language had two string types, the traditional str (in assorted encodings) and the newer unicode.
Python 3 also has two types for character sequences: bytes and str. bytes instances hold raw 8-bit values, str instances hold Unicode characters; Python 3's str is effectively Python 2's unicode.

In Python 3, str is defined as "a Unicode string": by default, a quoted literal is Unicode-encoded text.
Byte sequences in a concrete non-Unicode encoding such as UTF-8 or GBK are bytes, which resemble Python 2's str in many ways.
We need two helper functions to convert between the two cases, so converted input ends up in the form the caller expects.

# In Python 3, a helper that accepts str or bytes and always returns str:
def to_str(bytes_or_str):
    if isinstance(bytes_or_str, bytes):
        value = bytes_or_str.decode('utf-8')
    else:
        value = bytes_or_str
    return value  # Instance of str

# and one that accepts str or bytes and always returns bytes:
def to_bytes(bytes_or_str):
    if isinstance(bytes_or_str, str):
        value = bytes_or_str.encode('utf-8')
    else:
        value = bytes_or_str
    return value  # Instance of bytes

# In Python 2, a helper that accepts str or unicode and always returns unicode:
# python2
def to_unicode(unicode_or_str):
    if isinstance(unicode_or_str, str):
        value = unicode_or_str.decode('utf-8')
    else:
        value = unicode_or_str
    return value  # Instance of unicode

# and one that accepts str or unicode and always returns str:
# Python2
def to_str(unicode_or_str):
    if isinstance(unicode_or_str, unicode):
        value = unicode_or_str.encode('utf-8')
    else:
        value = unicode_or_str
    return value  # Instance of str
https://xin053.github.io/2016/10/30/Python%E5%AD%A6%E4%B9%A0%E9%87%8D%E7%82%B9%E6%91%98%E8%AE%B0/

str has an encode method that converts the string to bytes using a given encoding; this is called encoding.
The bytes class has a decode method that likewise takes an encoding as its single required argument and returns a str;
this is called decoding.

s = 'π排球①'
b1 = s.encode('utf-8')
b2 = s.encode()
print(b1)
print(b2)
# b'\xcf\x80\xe6\x8e\x92\xe7\x90\x83\xe2\x91\xa0'
# b'\xcf\x80\xe6\x8e\x92\xe7\x90\x83\xe2\x91\xa0'

import sys
print(sys.platform)
print(sys.getdefaultencoding())
# win32
# utf-8
So on this platform the default encoding is utf-8.

b = b'\xe6\x8e\x92\xe7\x90\x83'
s1 = b.decode(encoding='utf-8')
s2 = b.decode()
s3 = b.decode(encoding='latin-1')
print(s1)
print(s2)
print(s3)
# 排球
# 排球
# æ\x8e\x92ç\x90\x83  (mojibake)

>>> '请'.encode('unicode-escape')
b'\\u8bf7'
>>> b'\u8bf7'.decode('unicode-escape')
'请'

s = 'apple'
b = b'apple'
print(b)
print(type(b))
print(s)
print(type(s))
# b'apple'
# <class 'bytes'>
# apple
# <class 'str'>
A closer look at the bytes type: a byte string is essentially a sequence of single-byte integers. b = b'apple' https://www.zhihu.com/question/35584979
print(b[0])
print(b[1:])
print(list(b))
# 97
# b'pple'
# [97, 112, 112, 108, 101]

s = 'AÄBèc'
with open('utf-8data', 'w', encoding='utf-8') as f:
    f.write(s)
with open('utf-8data', 'r', encoding='utf-8') as f:
    u_str = f.read()
print(u_str)
# AÄBèc
with open('utf-8data', 'rb') as f:
    byte_str = f.read()
print(byte_str)
print(byte_str.decode(encoding='utf-8'))
# b'A\xc3\x84B\xc3\xa8c'
# AÄBèc

Running Python 2 and Python 3 side by side on Windows 10

Create these two batch files and put them on the PATH.
python2.bat

@echo off

rename "C:\Program Files\Python37\python.exe" python.exe.disabled
rename "C:\Program Files\Python37\Scripts\pip.exe" pip.exe.disabled
python3.bat

@echo off

rename "C:\Program Files\Python37\python.exe.disabled" python.exe
rename "C:\Program Files\Python37\Scripts\pip.exe.disabled" pip.exe

https://learnku.com/articles/31141

Rock, paper, scissors

# rock, paper, scissors
import random

guess_list = ["rock", "scissors", "paper"]
win_combination = [["paper", "rock"], ["rock", "scissors"], ["scissors", "paper"]]
while True:
    people = input('Enter rock, scissors, or paper\n').strip()
    computer = random.choice(guess_list)
    print('Computer plays: ' + computer)
    if people not in guess_list:
        print('Hey, that is not a valid move!')
        continue
    if computer == people:
        print('Tie, play again!')
    elif [computer, people] in win_combination:
        print('Computer wins! Keep going~~~')
    else:
        print('You win!')
        break
    print('---------------------------------')

print('Press any key to exit')
input()  # keep the console window from closing immediately

MD5 and SHA1 hashing


import hashlib

data = 'This a md5 test!'
hash_md5 = hashlib.md5(data.encode('utf-8'))  # Python 3 hashes bytes, so encode str input first

hash_md5.hexdigest()
MD5 is not just for strings as in the example above; common uses include:
hashing users' passwords on registration (though the big site password leaks of recent years show its limits...);
using the MD5 of an uploaded image or file as its file name (MD5 provides a practically unique id);
using MD5 values as keys in a key-value store;
checking whether two files are identical (download sites publish MD5 checksums precisely so tampering can be detected).
To compare two very large files without worrying about memory,
feed them in chunks with the update method. The code:
import hashlib

def get_file_md5(f):
    m = hashlib.md5()

    while True:
        data = f.read(10240)
        if not data:
            break

        m.update(data)
    return m.hexdigest()


with open(YOUR_FILE, 'rb') as f:  # open in binary mode so update() receives bytes
    file_md5 = get_file_md5(f)
You can verify chunked hashing with this snippet:
import hashlib

x = hashlib.md5()
x.update(b'hello, ')
x.update(b'python')
x.hexdigest()

hashlib.md5(b'hello, python').hexdigest()
Both calls print the same digest.
SHA1 works exactly the same way. https://p0sec.net/index.php/archives/33/
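For instance (a quick sketch, using the bytes form Python 3 requires):

import hashlib
print(hashlib.sha1(b'hello, python').hexdigest())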

Combining DataFrames with concat

import pandas as pd

india_weather = pd.DataFrame({
    'city': ['mumbai', 'delhi', 'banglore'],
    'temperature': [32, 34, 30],
    'humidity': [80, 60, 72]
})
us_weather = pd.DataFrame({
    'city': ['newyork', 'chicago', 'orlando'],
    'temperature': [21, 24, 32],
    'humidity': [68, 65, 70]
})
df = pd.concat([india_weather, us_weather])
df = pd.concat([india_weather, us_weather], ignore_index=True)

df = pd.concat([india_weather, us_weather], keys=['india', 'us'])

df.loc['india']
# axis=1 concatenates column-wise; temperature_df and windspeed_df come from the linked tutorial
df = pd.concat([temperature_df, windspeed_df], axis=1)
https://learnku.com/articles/26025

A math problem

A math problem: suppose a planet has 100 people and the population doubles every year. After how many years does it reach 1,000,000?
>>> p=100
>>> y=0
>>> while p<1000000:
... p*=2
... y+=1
...
>>> y
14
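The loop can be cross-checked in closed form: the smallest y with 100 * 2**y >= 1,000,000 is ceil(log2(10,000)) = 14.

>>> import math
>>> math.ceil(math.log2(1000000 / 100))
14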

Python syntax quick reference

http://www.ikeguang.com/2019/03/17/python-sytnax/
a = [1, 2]
b = a
print(id(a) - id(b))  # the address difference is 0: they are the same object
0
b.append(3)
print(a)  # only b was modified, but a changed with it
[1, 2, 3]
a is b
True
Slicing allocates a new object:


a is a[:]
False
If either operand is a float, the result is a float;
integer division with / always yields a float, even when it divides evenly;
Python's integers are arbitrary precision internally, so overflow is not a concern.
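A quick REPL check of all three rules:

>>> 1 + 2.0        # mixing int and float gives float
3.0
>>> 4 / 2          # true division always gives float
2.0
>>> 2 ** 100       # ints never overflow
1267650600228229401496703205376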
Sequences mainly include strings (str), lists (list), and tuples (tuple).
>>> 'ab'.index('b')
1
>>> 'b' in 'ab'
True
>>> max([1,2,3])
3
s = " I love Python" # 首位是空格
lst = s.split(' ')
lst1 = '-'.join(lst)
strip() 去掉字符串首尾两端的空格。方法 lstrip()/rstrip() 则只切除首端/尾端的空格。
'I like {} and {}'.format('Python', 'you')
'I like Python and you'
'{0} + {2} = {1}'.format (10, 20, 'Python ') # 按顺序引用
'10 + Python = 20'
'{0} * {1} = {0}'.format (10, 'Python ') # 编号反复引用
'10 * Python = 10'

Bing search

import requests, re, time, webbrowser, codecs

print('==========Search engine==========')
time.sleep(0.7)
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'Accept-Encoding': 'gzip, deflate, sdch',
           'Accept-Language': 'zh-CN,zh;q=0.8',
           'Cache-Control': 'max-age=0',
           'Connection': 'keep-alive',
           'Cookie': 'SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=E4CB65F3BD7F4EC7922E3642567A39EC&dmnchg=1; _EDGE_V=1; MUID=24CC781F18B266D70F9C758D199C670F; MUIDB=24CC781F18B266D70F9C758D199C670F; SRCHUSR=DOB=20190707&T=1562487393000; SNRHOP=I=&TS=; _EDGE_S=mkt=zh-cn&SID=2C85ED242A1D66051D4FE0B62B33673B; _SS=SID=2C85ED242A1D66051D4FE0B62B33673B&HV=1562490664; SRCHHPGUSR=CW=1089&CH=1742&DPR=1&UTC=480&WTS=63698084193&PR=3',
           'DNT': '1',
           'Host': 'cn.bing.com',
           'Upgrade-Insecure-Requests': '1',
           'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
try:
    try:
        while True:
            web_ids = {}
            test = []
            search = input('Search for: ')
            page = [11, 21, 31, 41, 51]

            def catch():
                global webs
                id_times = 0
                for t in page:
                    url = 'https://cn.bing.com/search?q=' + search + '&qs=n&sp=-1&pq&sc=0-5&sk=&cvid=275512403280414F9363B7CDC7368CBD&first=' + str(t) + '&FORM=PERE'
                    text = requests.get(url, headers=headers).text
                    a1 = '<h2>(.*?)</h2>'
                    a2 = 'href="(.*?)"'
                    a3 = '>(.*?)</a>'
                    title = re.findall(a1, text)
                    for j in title:
                        id_times = id_times + 1
                        title = re.findall(a3, j)
                        webs = re.findall(a2, j)
                        web_ids[id_times] = webs
                        print(id_times, title[0])
                        print('URL: %s' % webs[0])
                        print()
                        if len(webs) > 0:
                            test.append(1)

                if len(test) > 0:
                    print('Open URLs (separate several ids with an English comma; any other key opens nothing)')
                    open_web = input('URL ids: ').split(',')
                    if 'n' not in open_web:
                        for aweb in open_web:
                            if aweb.isdigit():
                                ty = int(aweb)
                                if ty in web_ids:
                                    web = web_ids[ty]
                                    webbrowser.open(web[0])

                else:
                    print('No results matched your search!')
                print('==============================')

            catch()

    except KeyboardInterrupt:
        exit()

except requests.exceptions.ConnectionError:
    print('Sorry, there was a network problem!')  # https://learnku.com/articles/32422
    time.sleep(3)
    exit()

requests-html

pip install requests-html
from requests_html import HTMLSession
session = HTMLSession()
# GET request
url = "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=batch&wl=1&local=2&batch=&syear=2013"
r = session.get(url)
r.encoding = 'utf-8'  # fix garbled Chinese
print(r.text)
# save the fetched page locally
with open('test.html', 'wb') as f:
    f.write(r.content)

# POST request
url = 'https://shuju.wdzj.com/plat-info-target.html'
params = {'wdzjPlatId': 59, 'type': 1, 'target1': 1, 'target2': 0}
r = session.post(url, params=params)
print(r.text)

### custom request headers
headers = {'user-agent': 'my-app/0.0.1'}
r = session.get(url, headers=headers)
r = session.get('http://www.w3school.com.cn')
print(r.html.links)

# output (long, middle part omitted)
{'http://www.w3ctech.com/', '/glossary/index.asp', '/html5/html5_quiz.asp', '/php/index.asp', '/asp/index.asp', '/php/php_ref_date.asp', 'http://wetest.qq.com/?from=links_w3school', '/asp/asp_ref.asp', '/tags/index.asp', '/xmldom/index.asp', '/example/csse_examples.asp', '/w.asp', '/index.html', 'http://weibo.com/w3schoolcomcn', '/ws.asp', '/b.asp', '/cssref/index.asp', '/jquerymobile/index.asp',
...
'/xsl/xsl_languages.asp',
'/example/html_examples.asp'}

# absolute URLs
t = r.html.absolute_links
print(t)
{'http://www.w3school.com.cn/media/index.asp',
'http://www.w3school.com.cn/glossary/index.asp',
'http://www.w3school.com.cn/php/php_ref.asp',
'http://www.w3school.com.cn/site/index.asp',
...
'http://www.w3school.com.cn/asp/asp_quiz.asp'}
# grab the menu list on the left of the w3school home page; first=True returns the first match ('HTML tutorial')
menuList = r.html.find('#navsecond > ul', first=True)
print(menuList.text)

# output
HTML
HTML5
XHTML
CSS
CSS3
TCP/IP

# titles and links of all menus
menuList = r.html.find('#navsecond > ul')
for menu in menuList:
    print(menu.text)            # the title
    print(menu.absolute_links)  # the links

# output
HTML
HTML5
XHTML

from requests_html import HTMLSession
import requests
import os  # needed below to create the bg folder

session = HTMLSession()


# wallpaper listing page
url = "http://www.win4000.com/wallpaper_2285_0_10_1.html"
r = session.get(url)

# create the bg folder
if not os.path.exists('bg'):
    os.mkdir('bg')

# save an image under bg/
def save_image(url, title):
    img_response = requests.get(url)
    with open('./bg/' + title + '.jpg', 'wb') as file:
        file.write(img_response.content)

# find the image links in the list page, follow each one,
# and grab the full-size image address (pic-large) from the detail page
items_img = r.html.find('ul.clearfix > li > a')
for img in items_img:
    img_url = img.attrs['href']
    if "/wallpaper_detail" in img_url:
        r = session.get(img_url)  # parse the detail page
        item_img = r.html.find('img.pic-large', first=True)
        url = item_img.attrs['src']      # full-size image URL
        title = item_img.attrs['title']  # image title
        print(url + title)
        save_image(url, title)

http://www.golang365.com/#/blog/17
http://www.golang365.com/#/blog/17

Printing today's date

import time

# get today's date and time
nowdate = time.localtime(time.time())  # current time as a struct_time
today = time.strftime('%Y-%m-%d %H:%M:%S', nowdate)  # render in the given format
print(today)

Saving a JSON file

import io
import json  # missing in the original snippet

sendData = [
    {
        'id': 1,        # keys must be string literals, not bare names
        'name': '奥特曼'
    }, {
        'id': 2,
        'name': '小怪兽'
    }
]

with io.open('data.json', 'w', encoding="utf-8") as file:
    json.dump(sendData, file, ensure_ascii=False, sort_keys=True, indent=2)
print('saved')  # http://www.golang365.com/#/blog/18

OpenCV

pip install --upgrade setuptools

pip install numpy Matplotlib

pip install opencv-python
If the download keeps failing, fetch a whl package from http://www.lfd.uci.edu/~gohlke/pythonlibs/ and install it with pip as well.

# import the cv module
import cv2 as cv
# read an image; bmp, jpg, png, tiff and other common formats are supported
img = cv.imread(r"E:\python\test.jpg")
# create a window and show the image
cv.namedWindow("Image")
cv.imshow("Image", img)
cv.waitKey(0)
# destroy the window (the module was imported as cv, not cv2)
cv.destroyAllWindows()
http://www.golang365.com/#/blog/19

Converting images to PDF

pip install reportlab
import sys
from reportlab.pdfgen import canvas

# generate a multi-page PDF: a three-page file
def texttopdf():
    c = canvas.Canvas('text.pdf')
    c.drawString(100, 100, "Some text in first page.")
    c.showPage()
    c.drawString(100, 100, "Some text in second page.")
    c.showPage()
    c.drawString(100, 100, "Some text in third page")
    c.showPage()
    c.save()

texttopdf()
print('converted!')

# convert a single image to PDF without quality loss, so it stays sharp

import sys
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from PIL import Image

# convert the image passed as an argument if one is given, otherwise convert wechat.png
if len(sys.argv) > 1:
    img = sys.argv[1]
    filename = img.split('.')[0]
    f_jpg = filename + '.jpg'
    f_pdf = filename + '.pdf'
    print(f_jpg)

else:
    img = 'wechat.png'
    f_pdf = 'test.pdf'


def imgtopdf():
    (maxw, maxh) = Image.open(img).size
    c = canvas.Canvas(f_pdf, pagesize=(maxw, maxh))
    c.drawImage(img, 0, 0, maxw, maxh)
    c.showPage()
    c.save()


imgtopdf()
print('converted!')
https://github.com/sweida/python-study/tree/master/imgToPdf
http://www.golang365.com/#/blog/22
https://github.com/sweida/python-study/tree/master/imgToPdf
http://www.golang365.com/#/blog/22

Simulating login

import requests
session = requests.session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

def login():
    url = 'http://119.29.27.100/apis/login'
    data = {
        'username': 'XXX',       # placeholder account
        'password': '*******'    # placeholder password
    }
    response = session.post(url, data=data, headers=headers)

    responseData = response.json()
    if responseData['status'] == 1:
        print('login succeeded')
        comment()
    else:
        print('login failed', 'reason:', responseData['msg'])


def comment():
    url = 'http://119.29.27.100/apis/message/add'
    data = {
        'content': '这条应该是有登录的',
        'ykname': ''
    }
    response = session.post(url, data=data, headers=headers)

    responseData = response.json()
    if responseData['status'] == 1:
        print('comment posted')
    else:
        print('comment failed', 'reason:', responseData['msg'])

login()

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

# the cookie captured from the browser after logging in
cookie_str = r'_myFavMv=%5B%5D; td_cookie=3034830472; laravel_session=eyJpdiI6Ik4wSjFSUU1wcFo1SndYRFliNWZZeXc9PSIsInZhbHVlIjoiMHJWZzM1WmpGRXp6NWVLYk9OaUdHOVVzcWRNK25lQ21lMFhIcmk4eUxKcEFMSnhwSDBMbTFyM3duUllqT3IycGRIc3V2TGhzWEdWaytWRkpzT3hNelE9PSIsIm1hYyI6ImNiMjRhMGFiYTIxYWJhMjUwZDJlNGI2ODgzY2ZiYzY4ZGY4NzI0MDQ4OGZkN2RiNGIwZGM2M2I2YmExYmY3OGIifQ%3D%3D'

# turn the cookie string into a dict for use below
cookies = {}
for line in cookie_str.split(';'):
    key, value = line.split('=', 1)
    cookies[key] = value

def comment():
    url = 'http://119.29.27.100/apis/message/add'
    data = {
        'content': '再试一条cookie请求',
        'ykname': ''
    }
    response = requests.post(url, data=data, headers=headers, cookies=cookies)

    responseData = response.json()
    # print(responseData)
    if responseData['status'] == 1:
        print('comment posted')
    else:
        print('comment failed', 'reason:', responseData['msg'])

comment()
http://www.golang365.com/#/blog/54

Maoyan box office

import re
import os  # needed for os.getenv below
import time
import datetime
import base64

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fontTools.ttLib import TTFont

font = TTFont('font1.woff')
uni_list = font.getGlyphOrder()[2:]
first_match = {
    'uniE893': '0',
    'uniF690': '1',
    'uniF55C': '2',
    'uniF28F': '3',
    'uniF4B1': '4',
    'uniE623': '5',
    'uniF294': '6',
    'uniEEC4': '7',
    'uniE577': '8',
    'uniE77B': '9'
}

def get_one_page(date):
    headers = {
        'User-Agent': os.getenv('User_Agent')
    }
    url = 'https://piaofang.maoyan.com/?ver=normal&date=' + date
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException as e:
        print("Requests {}, Error {}.".format(date, e.args))
        return None

def parse_font(html):
    fonts = re.findall(r'base64,(.*?)\)', html, re.S)[0]
    # fonts = re.search(r'base64,(.*?)\)', html, re.S)
    fonts = base64.b64decode(fonts)
    with open('tmp.woff', 'wb') as fp:
        fp.write(fonts)
    font1 = TTFont('tmp.woff')
    # obj_list1 = font1.getGlyphNames()[1:-1]
    uni_list1 = font1.getGlyphOrder()[2:]
    tmp_match = {}
    for uni1 in uni_list1:
        obj1 = font1['glyf'][uni1]  # the glyph object for code uni1 in tmp.woff
        for uni in uni_list:
            obj = font['glyf'][uni]
            if obj == obj1:
                tmp_match[uni1] = first_match[uni]
    return tmp_match

def rebuild_number(number, tmp_match):
    '''rewrite the obfuscated digits back to plain text'''
    result = ''
    for num in number:
        s = str(hex(ord(num)))
        s = s.upper().replace('0X', 'uni')
        if s in tmp_match.keys():
            result += tmp_match[s]
        else:
            result += num
    return result

def parse_one_page(html):
    tmp_match = parse_font(html)
    doc = pq(html)
    today = doc('.today').text()[:10]
    movies = doc('#ticket_tbody ul').items()
    for movie in movies:
        result = {}
        result['date'] = today
        result['movieName'] = movie.find('.c1 b').text()
        result['releaseInfo'] = movie.find('.c1 em').text().split()[0]
        result['sumBoxInfo'] = rebuild_number(movie.find('.c1 em i').text(), tmp_match)
        result['boxInfo'] = rebuild_number(movie.find('.c2').text(), tmp_match)
        result['boxRate'] = rebuild_number(movie.find('.c3').text(), tmp_match)
        result['showRate'] = rebuild_number(movie.find('.c4').text(), tmp_match)
        result['avgSeatView'] = rebuild_number(movie.find('.c5').text(), tmp_match)
        yield result

def main():
    start_date = datetime.date.today()
    for i in range(0, 31):
        date = start_date - datetime.timedelta(days=i)
        html = get_one_page(date.isoformat())
        for result in parse_one_page(html):
            print(result)
        time.sleep(1)

if __name__ == '__main__':
    main()
https://learnku.com/articles/32534#reply104205

锟斤拷 (classic GBK mojibake)

>>> s = (u'\uFFFD'.encode('utf8')*2)
>>> print(s.decode('gbk'))
锟斤拷
When a Unicode decoder hits bytes it cannot interpret, it substitutes U+FFFD, the Unicode replacement character, displayed as �; two of its UTF-8 encodings read as GBK produce 锟斤拷.
http://cuihuan.net/2019/05/12/%E5%AD%97%E7%AC%A6%E7%BC%96%E7%A0%81%E9%82%A3%E4%BA%9B%E4%BA%8B%E5%84%BF/

Detecting nude images

pip install nudepy  # the nude module is published on PyPI as nudepy (assumed from the nude.py project)
import nude
print(nude.is_nude("godfather.jpg"))
print(nude.is_nude("leisheng.jpg"))
print(nude.is_nude("qiaoba.png"))

import glob
import itertools
from nude import Nude

images_format = ['jpg', 'png', 'gif']  # image formats
images_jpg = glob.glob("E:/Images/OOXX/*.jpg")  # file names matching the pattern
images_png = glob.glob("E:/Images/OOXX/*.png")
images_gif = glob.glob("E:/Images/OOXX/*.gif")

images_list = itertools.chain(images_jpg, images_png, images_gif)

for i in images_list:
    print(i)            # path of the image
    n = Nude(i)         # analyse the image
    n.parse()
    print(n.result)     # the verdict
    print(n.message)    # a short explanation
    print(n.inspect())  # detailed diagnostics


Source: https://lbjheiheihei.xyz/2018/05/14/Use-Python-Identifying-Porngraphic-Images.html

Fixing a failed Scrapy installation

Install lxml, pyOpenSSL, Twisted, and pywin32 in turn; these base libraries all need to be in place.

pip install lxml



If that fails, download a wheel from the sites below.

https://pypi.org/project/lxml/#files
https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
For example, for 32-bit Python 3.6 download lxml-4.2.1-cp36-cp36m-win32.whl, open cmd, cd to the file's directory, then:

pip install lxml-4.2.1-cp36-cp36m-win32.whl

The argument must match the file name exactly, i.e. pip install <file name>.whl; press Enter and it installs in a moment.


Source: https://lbjheiheihei.xyz/2018/05/27/Install-Scrapy-In-Window.html

ChromeDriver

https://sites.google.com/a/chromium.org/chromedriver/downloads http://phantomjs.org/
import time
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--incognito")  # open in incognito mode
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"  # path to chromedriver.exe (raw string so the backslashes survive)
browser = webdriver.Chrome(executable_path=driver_path, options=options)
browser.get("https://kejibear.xyz/auth/login")  # the site URL
browser.find_element_by_css_selector(".card-inner input[name='Email']").send_keys("@qq.com")  # account
browser.find_element_by_css_selector(".card-inner input[name='Password']").send_keys("1")  # password
browser.find_element_by_css_selector(".row .col-md-10.col-md-push-1 button.waves-effect").click()
print("logged in~")
time.sleep(3)
browser.find_element_by_css_selector(".card-action-btn #checkin-btn button.waves-effect").click()
print("checked in~")
time.sleep(5)
browser.close()

Generating colorful animated QR codes

pip install myqr
from MyQR import myqr
version, level, qr_name = myqr.run(
    words='dhb cdfb64%vjk',  # Chinese is not supported; 0-9, a-z, A-Z, spaces and common punctuation are
    version=2,               # version, from 1 to 40
    level='H',               # error-correction level: L, M, Q, H, from low to high
    picture='4e.jpg',        # the background file must sit in the working directory
    colorized=True,          # True for color, False for black and white
    contrast=1.0,            # contrast
    brightness=1.0,          # brightness
    save_name='1d6.bmp',     # any name works; jpg, png, bmp, or gif
    save_dir=r"F:\二维码"    # the directory must already exist
)

myqr 666 -p 666.png -c
https://lbjheiheihei.xyz/2018/04/26/Use-Python-Generate-Colorful-QRcode.html

Scraping a Jianshu user's activity timeline

import requests
from lxml import etree

my_header = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline', headers={'user-agent': my_header})

if '大神带我来搬砖' in res.text:
    print('found')
page = etree.HTML(res.text)
last_li = page.xpath('''//ul[@class="note-list"]/li[last()]''')[0]
max_id = int(last_li.get('id').split('-')[1]) - 1

file = open("activity.txt", 'w', encoding='utf-8')

page = 2
while True:
    res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline?max_id=%s&page=%s' % (max_id, page),
                       headers={'user-agent': my_header, 'X-INFINITESCROLL': 'true'})

    last_li = etree.HTML(res.text).xpath('''/html/body/li[last()]''')[0]
    max_id = int(last_li.get('id').split('-')[1]) - 1
    page = page + 1
    file.write(res.text)
    file.write("\n")
    if '加入了简书' in res.text:
        print('end')
        break

file.close()
#https://www.jianshu.com/p/35a85ee14f7b

from selenium import webdriver
import time

options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(chrome_options=options)
browser.set_page_load_timeout(60)

browser.get("https://www.jianshu.com/users/5aa8494a18c8/timeline")
time.sleep(5)

file = open("browser.txt", 'w', encoding='utf-8')

while True:
    text = browser.find_element_by_xpath("""//*[@id="list-container"]/ul""").text
    file.write(text)
    # remove li elements
    js = '''var nodeList=document.querySelectorAll("#list-container > ul > li");for(var i=0;i<nodeList.length-1;i++){nodeList[i].remove()}'''
    browser.execute_script(js)

    # scroll
    browser.execute_script("document.documentElement.scrollTop=0")
    browser.execute_script("document.documentElement.scrollTop=1600")
    time.sleep(10)

    if '加入了简书' in text:
        print("end")
        break

file.write(text)
file.close()

Retrying requests with retrying

pip install retrying
import requests
from retrying import retry

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

@retry(stop_max_attempt_number=3)  # retry the decorated function up to 3 times
def _parse_url(url):
    print("-" * 30)
    response = requests.get(url, headers=headers, timeout=3)
    assert response.status_code == 200
    return response.content.decode()

def parse_url(url):
    try:
        html_str = _parse_url(url)
    except:
        html_str = None
    return html_str

if __name__ == "__main__":
    url = 'http://www.baidu.com'  # the original omitted the scheme, which makes requests raise MissingSchema
    print(parse_url(url))

https://learnku.com/articles/33001

IPython

Tab triggers autocompletion.
A question mark ? shows an object's documentation.
A family of magic commands starting with % streamlines common tasks.
Extra plotting facilities are provided.
Operating-system commands can be run directly.
Use %quickref to open IPython's reference card, or %magic for details on the magic commands, to get more out of the tool.
The last two results (Out) are kept in the _ and __ variables; IPython also saves the whole input/output history as _iN and _N, where N is the line number.

In [8]: eval(_i4) # eval('1 + 7*9')
Out[8]: 64
Commands executed in IPython can be logged to a Python file with %logstart
In [22]: !ls
IPython.md data_science.md pandas.ipynb
Jupyter.ipynb README.md

IPython also ships code-analysis tools, such as %time and %timeit for timing and %prun / %run -p for basic profiling.

%time: run once and report the elapsed time;
%timeit: run many times and report the average.
In [42]: %time 'foobar'.startswith('foo')
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
Out[42]: True
https://learnku.com/articles/33122#reply105653

The most frequent element in a list


test = [1, 2, 3, 4, 2, 2, 3, 1, 4, 4, 4, 4]
print(max(set(test), key=test.count))
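The same answer via collections.Counter, which also reports the count:

from collections import Counter
test = [1, 2, 3, 4, 2, 2, 3, 1, 4, 4, 4, 4]
print(Counter(test).most_common(1))  # [(4, 5)] -- 4 appears five times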

The pygame module

import pygame
from pygame import *

pygame.init()
size = (600, 400)  # window size
screen = pygame.display.set_mode(size)  # create the window at that size
screen.fill((250, 250, 250))
pygame.display.update()
With the code written, the window would still show black: it has not been refreshed yet, which is what the pygame.display.update() call at the end does.
https://learnku.com/articles/33209#reply105837

Sending HTML-formatted email

#!/usr/bin/python
# -*- coding: utf-8 -*-
import smtplib, time, os
from email.mime.text import MIMEText
from email.header import Header

def send_mail_html(file):
    sender = 'admin@jinchuang.org'      # sender
    receiver = 'gaojing@jinchuang.org'  # recipient
    t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # current time
    subject = 'blog disk usage alert_' + t  # subject line
    smtpserver = 'smtp.exmail.qq.com'   # SMTP server
    username = 'admin@jinchuang.org'    # user name
    password = 'passwd'                 # password

    f = open(file, 'rb')
    mail_body = f.read()
    f.close()


    msg = MIMEText(mail_body, _subtype='html', _charset='utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receiver

    try:
        smtp = smtplib.SMTP()
        smtp.connect(smtpserver)
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, msg.as_string())
    except:
        print("failed to send the mail!")
    else:
        print("mail sent!")
    finally:
        smtp.quit()

file = '/tmp/df.html'  # the HTML body
send_mail_html(file)

HTML table template (a shell script that renders df output into /tmp/df.html):
#!/bin/bash
ip=`ifconfig |grep -v 127 |grep inet|awk '{print $2}'`
a=`df -hT|grep -w "/"|awk '{print $1}'`
b=`df -hT|grep -w "/"|awk '{print $2}'`
c=`df -hT|grep -w "/"|awk '{print $3}'`
d=`df -hT|grep -w "/"|awk '{print $4}'`
e=`df -hT|grep -w "/"|awk '{print $5}'`
f=`df -hT|grep -w "/"|awk '{print $6}'`
g=`df -hT|grep -w "/"|awk '{print $7}'`

html="<html>
<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">
<head>
<style type=\"text/css\">
table{margin-top:5%;width:500px}
table.gridtable {
font-family: verdana,arial,sans-serif;
font-size:14px;
color:#333333;
border-width: 1px;
border-color: #666666;
border-collapse: collapse;
}
table.gridtable th {
border-width: 1px;
padding: 8px;
background-color: #008eff;
color:#fff;
}
table.gridtable td {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #afafaf;
background-color: #ffffff;
}
table tr:first-child td:first-child, table tr:first-child th:first-child{
border-top-left-radius: 5px;
}
table tr:first-child td:last-child, table tr:first-child th:last-child{
border-top-right-radius: 5px;
}
</style>
</head>
<body>
<table class=\"gridtable\" align=center>
<tr>
<th colspan="7">Alert host: 192.168.11.1</th>
</tr>
<tr>
<td>Filesystem</td><td>Type</td><td>Total</td><td>Used</td><td>Available</td><td>Use%</td><td>Mount point</td>
</tr>
<tr>
<td>$a</td><td>$b</td><td>$c</td><td>$d</td><td>$e</td><td style=\"color:red;font-weight:bold\">$f</td><td>$g</td>
</tr>
</table>
</body>
</html>"
echo -e "$html" >/tmp/df.html

To use the SSL protocol, change two lines:
smtpserver = 'smtp.exmail.qq.com:465'  # add port 465
smtp = smtplib.SMTP_SSL()  # use the SSL class
https://me.jinchuang.org/archives/272.html

PDF

import pdfkit

pdfkit.from_file('jianshu.htm', 'out.pdf')
pdfkit.from_string('HelloWorld', 'out.pdf')

with open('jianshu.htm', 'r') as f:
    pdfkit.from_file(f, 'out.pdf')
pdfkit.from_url(['https://www.jianshu.com/', 'https://www.baidu.com/'], 'out.pdf')

#pdfkit.from_file(['jianshu.htm','jianshu1.htm'],'out.pdf')

options = {
    'page-size': 'A4',  # or Letter
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",
    'no-outline': None
}
pdfkit.from_url('https://www.jianshu.com/', 'out1.pdf', options=options)

https://juejin.im/post/5ce69794e51d4577523f22ef

Converting Tianshan (hellobi) blog posts to PDF

wkhtmltopdf  'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599'  sengblog.pdf

wkhtmltopdf --javascript-delay 2000 'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599' sengblog.pdf

wkhtmltopdf --dump-outline out.xsl toc 'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599' sengblog.pdf

https://ask.hellobi.com/blog/seng/3691

Batch-adding watermarks to images

Removing image watermarks with OpenCV in Python: https://mp.weixin.qq.com/s/BqeBk0oPP1KpueviCwZFFQ

https://mp.weixin.qq.com/s/QnMzvq_VWs2HyKHhD4FxQg
import os, traceback
from PIL import Image

# collect the images in a folder
def get_folder(fpath, wm_file, save_path):
    try:
        img_suffix_list = ['png', 'jpg', 'bmp']
        for i in os.listdir(fpath):
            if i.split('.')[-1] in img_suffix_list:
                img_path = fpath + '/' + i
                img_water_mark(img_file=img_path, wm_file=wm_file, save_path=save_path)
    except Exception as e:
        print(traceback.print_exc())

# add a watermark to one image
def img_water_mark(img_file, wm_file, save_path):
    try:
        img = Image.open(img_file)       # open the image
        watermark = Image.open(wm_file)  # open the watermark
        img_size = img.size
        wm_size = watermark.size
        # if the image is smaller than the watermark, shrink the watermark
        if img_size[0] < wm_size[0]:
            watermark.resize(tuple(map(lambda x: int(x * 0.5), watermark.size)))
        print('image size:', img_size)
        wm_position = (img_size[0] - wm_size[0], img_size[1] - wm_size[1])  # default watermark position: bottom right
        layer = Image.new('RGBA', img.size)  # create a new layer
        layer.paste(watermark, wm_position)  # paste the watermark onto the layer
        mark_img = Image.composite(layer, img, layer)
        new_file_name = '/new_' + img_file.split('/')[-1]
        mark_img.save(save_path + new_file_name)
    except Exception as e:
        print(traceback.print_exc())


Converting Markdown to PDF

pip install markdown 
markdown.markdown() 函数就可以读取 md 文件里的内容了
先转HTML
import markdown
import os
import codecs
'''
savepath = "F:\RenZhengfei"
os.chdir(savepath)
file = codecs.open("README.md", mode="r", encoding="utf-8")
text = file.read()

html = markdown.markdown(text)
print(html)
with open('file_name.html', 'w') as f:
f.write(html)
'''

head = """<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<style type="text/css">
code {
color: inherit;
background-color: rgba(0, 0, 0, 0.05);
}
</style>
</head>
<body>
"""

foot = """
</body>
</html>
"""
filepath = "F:\RenZhengfei-master\ALL"
savepath = "F:\RenZhengfei-master\ALL-html"
if not os.path.isdir(savepath):
os.mkdir(savepath)
os.chdir(savepath)

i = 0
pathDir = os.listdir(filepath)
for allDir in pathDir:
if (allDir == "pdf"):
continue
name = allDir
print(name)

os.chdir(filepath)
fp1 = codecs.open(name, mode="r", encoding="utf-8")
text = fp1.read()
html = markdown.markdown(text)
fp1.close()
#print(html)

fname = name.replace('md', 'html')

#f2 = '%s.html' % (fname)
os.chdir(savepath)
fp2 = codecs.open(fname, "w", encoding="utf-8", errors="xmlcharrefreplace")
fp2.write(head + html + foot)
fp2.close()

print(i)

https://wemp.app/posts/6f807ecf-9ebd-4449-b419-2cfbf8c2e41f

import time
import pdfkit
import os

wk_path = r'E:\Program Files\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=wk_path)

filepath = r"F:\RenZhengfei-master\ALL-html"
savepath = r"F:\RenZhengfei-master\ALL-pdf"
time1 = time.time()
pathDir = os.listdir(filepath)
for allDir in pathDir:
    if allDir == "pdf":
        continue
    name = allDir
    print(name)
    htmlpath = filepath + "\\" + name
    print(htmlpath)
    name = name.replace('html', 'pdf')
    os.chdir(savepath)
    pdfkit.from_url(htmlpath, name, configuration=config)

time2 = time.time()
print(str(time2 - time1) + " s")
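Since the inputs here are local HTML files rather than URLs, pdfkit.from_file is arguably the more natural call; a minimal sketch with the same configuration (file names are my own placeholders):

import pdfkit

config = pdfkit.configuration(wkhtmltopdf=r'E:\Program Files\wkhtmltox\bin\wkhtmltopdf.exe')
# one local HTML file in, one PDF out
pdfkit.from_file('input.html', 'output.pdf', configuration=config)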

Validating Chinese ID numbers

import time

# generate every date in the given birth year as a YYYYMMDD string
def dateRange(year):
    fmt = '%Y-%m-%d'
    bgn = int(time.mktime(time.strptime(year + '-01-01', fmt)))
    end = int(time.mktime(time.strptime(year + '-12-31', fmt)))
    list_date = [time.strftime(fmt, time.localtime(i)) for i in range(bgn, end + 1, 3600 * 24)]
    return [i.replace('-', '') for i in list_date]

data_time = dateRange('1993')

# pip install id-validator
from id_validator import validator

# walk every date and print the ID numbers that pass validation
def vali_dator(id1, id2, id3):
    for i in dateRange(id2):
        theid = id1 + i + id3
        if validator.is_valid(theid):
            print(theid)

vali_dator('330221', '1993', '4914')

https://mp.weixin.qq.com/s?__biz=MzU5MjI3NzIxMw==&mid=2247486816&idx=1&sn=baa976db515e3b9b99e7001daa9a577a&chksm=fe2376d2c954ffc486625e5420e3ebcf3d83581986b0568b804fb5a54e4aaa032b4992c13905&mpshare=1&scene=1&srcid=1023PX0DRWmDc5E8oEZSVUx6&sharer_sharetime=1571795782903&sharer_shareid=43165518fc08bc947dca48788293333a&key=6f23511bf9e1c01f4c78d4f8f46e1b1e8fc6e548405a6029e3b015de7441c1527cd4817fc238470a3211f36f03178e6f7f9888d5f7d1ee5e6ef6b0b0fced5da2f45aa739e184ae5749a86f5102efd4f9&ascene=1&uin=NjQ3OTQwMTAy&devicetype=Windows+7&version=62070152&lang=zh_CN&pass_ticket=Nl73k%2FpmXYhrLnAbsjSStmagh1FEZZkB8fhtyVf9%2BmzY8foNNpPw%2FmaVHa2zPKdu
#print(validator.get_info('330221199306084914'))
https://github.com/zpw1995/aotodata/blob/master/interest/ID_card/ID_card.py

NumPy basics

NumPy's main object is the multidimensional array, ndarray. In NumPy, dimensions are called axes, and the number of axes is the rank.
>>> np.array([1, 2, 3])
array([1, 2, 3])
>>> np.array([(1, 2, 3), (4, 5, 6)])
array([[1, 2, 3],
       [4, 5, 6]])
>>> np.zeros((3, 3))
array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])
>>> np.arange(5)
array([0, 1, 2, 3, 4])
>>> np.arange(6).reshape(2, 3)
array([[0, 1, 2],
       [3, 4, 5]])
>>> np.random.rand(2, 3)
array([[0.50122984, 0.98824375, 0.81388012],
       [0.60951775, 0.02055326, 0.97622093]])
>>> np.random.randint(5, size=(2, 3))
array([[2, 0, 2],
       [4, 4, 4]])

https://learnku.com/articles/35684
>>> a = np.array([1, 2, 3, 4, 5])
>>> b = np.arange(1, 6)
>>> a, b
(array([1, 2, 3, 4, 5]), array([1, 2, 3, 4, 5]))
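To inspect the axes and rank mentioned above, ndim gives the number of axes and shape gives the length along each axis:
>>> a = np.array([(1, 2, 3), (4, 5, 6)])
>>> a.ndim
2
>>> a.shape
(2, 3)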

Scraping the Douban Top 250 movies

import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'
# disguise the request as a browser via the User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# first fetch the page with requests
r = requests.get(url, headers=headers)
# then parse the fetched page with bs4
soup = BeautifulSoup(r.text, 'html.parser')
# prettify() normalizes the parsed HTML
print(soup.prettify())
movie_list = soup.find('ol', attrs={'class': 'grid_view'})  # the movie list

for movie in movie_list.find_all('li'):
    movie_name = movie.find('span', attrs={'class': 'title'})
    print(movie_name.get_text())

# sample output:
肖申克的救赎
霸王别姬
这个杀手不太冷
阿甘正传
美丽人生
import codecs
import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'https://movie.douban.com/top250'

def download_page(url):
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }).content

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # the movie list
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        movie_name = movie_li.find('span', attrs={'class': 'title'}).get_text()
        movie_info = movie_li.find('div', attrs={'class': 'bd'}).find('p').get_text()
        movie_star = movie_li.find('span', attrs={'class': 'rating_num'}).get_text()
        movie_name_list.append(movie_name)
        movie_name_list.append(movie_info)
        movie_name_list.append(movie_star)
    # link to the next page
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None

def main():
    url = DOWNLOAD_URL
    with codecs.open('movies', 'w', encoding='utf-8') as f:  # 'wb' would conflict with encoding
        while url:  # parse_html returns None for url on the last page, ending the loop
            html = download_page(url)
            movies, url = parse_html(html)
            f.write(u'{movies}\n'.format(movies='\n'.join(movies)))

if __name__ == '__main__':
    main()
https://www.jianshu.com/p/8a460be5a26e

Converting BMP and PNG to JPG

import os
from PIL import Image

for root, dirs, files in os.walk("."):
    for bmpfig in files:
        if not bmpfig.endswith('.bmp') and not bmpfig.endswith('.png'):
            continue
        bmpfig = os.path.join(root, bmpfig)
        newfigname = bmpfig[:-4] + ".jpg"
        print("converting from", bmpfig, "to", newfigname)
        img = Image.open(bmpfig)
        img = img.convert('RGB')  # needed for PNG: drops the alpha channel
        img.save(newfigname, format='jpeg', quality=95)
        img.close()
        os.remove(bmpfig)
https://zjyfdu.github.io/2018/08/16/python%E6%8A%8Abmp%E8%BD%AC%E6%8D%A2%E6%88%90jpg/

numpy

import numpy as np
>>> print(np.__version__)
1.16.2
>>> np.array([1, 2, 3])
array([1, 2, 3])
>>> np.arange(5)
array([0, 1, 2, 3, 4])
>>> np.arange(6).reshape(2, 3)
array([[0, 1, 2],
       [3, 4, 5]])
>>> np.random.rand(2, 3)
array([[0.50122984, 0.98824375, 0.81388012],
       [0.60951775, 0.02055326, 0.97622093]])
>>> np.random.randint(5, size=(2, 3))
array([[2, 0, 2],
       [4, 4, 4]])

>>> a = np.array([1, 2, 3, 4, 5])
>>> b = np.arange(1, 6)
>>> a + b
array([ 2,  4,  6,  8, 10])

>>> a = np.array([10, 20, 30, 40, 50])  # the outputs below correspond to this redefinition of a
>>> np.sin(a)
array([-0.54402111,  0.91294525, -0.98803162,  0.74511316, -0.26237485])

>>> np.sqrt(a)
array([3.16227766, 4.47213595, 5.47722558, 6.32455532, 7.07106781])
>>> a ** 0.5  # equivalent to np.sqrt(a)
array([3.16227766, 4.47213595, 5.47722558, 6.32455532, 7.07106781])
>>> np.power(a, 3)
array([  1000,   8000,  27000,  64000, 125000])
>>> a ** 3  # equivalent to np.power(a, 3)
https://learnku.com/articles/35686

pdf to image

https://github.com/freedesktop/poppler
pdftoppm -singlefile -f 4 -r 72 -jpeg -jpegopt quality=90 presentation.pdf test_poppler

pdftoppm -f 1 -r 72 -jpeg -jpegopt quality=90 test_20191120_134947.pdf test_poppler
Without -singlefile this converts every page, producing one image per page.
from pdf2image import convert_from_path

def main():
    pages = convert_from_path("presentation.pdf", first_page=2,
                              single_file=True)
    pages[0].save("test_pdf2image.jpg", quality=85)

if __name__ == "__main__":
    main()
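To convert every page with pdf2image instead, a sketch along these lines should work; convert_from_path returns one PIL image per page, and the output naming is my own choice:

from pdf2image import convert_from_path

pages = convert_from_path("presentation.pdf", dpi=72)
for i, page in enumerate(pages, start=1):
    page.save("page_%03d.jpg" % i, quality=85)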
https://jdhao.github.io/2019/11/14/convert_pdf_to_images_pdftoppm/
https://imagemagick.org/script/download.php
convert -density 150 presentation.pdf -quality 90 output-%3d.jpg
https://jdhao.github.io/2019/11/20/convert_pdf_to_image_imagemagick/#convert-all-pages-of-pdf-file-to-images

Image EXIF

https://www.irfanview.com/
from PIL import Image
from PIL.ExifTags import TAGS

img = Image.open('test.jpg')

exif = img.getexif()

for k, v in exif.items():
    print('{}: {}'.format(TAGS[k], v))

from PIL import Image
import piexif

img = Image.open('test.jpg')
if "exif" in img.info:
    exif_dict = piexif.load(img.info['exif'])

    if piexif.ImageIFD.Orientation in exif_dict['0th']:
        exif_dict['0th'][piexif.ImageIFD.Orientation] = 3

    # quick and dirty workaround to avoid a type error
    exif_dict['Exif'][41729] = b'1'

    exif_bytes = piexif.dump(exif_dict)

    img.save('new_img.jpg', exif=exif_bytes)
https://jdhao.github.io/2019/07/31/image_rotation_exif_info/

Data science at the command line

curl -o data_dl.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 ,50,12500,98 ,1
0 ,13,3250,28 ,1
1 ,16,4000,35 ,1
2 ,20,5000,45 ,1
1 ,24,6000,77 ,0

pip install csvkit
csvclean data_dl.csv
csvcut -n data_dl_out.csv | cut -c6-
Recency (months)
Frequency (times)
Monetary (c.c. blood)
Time (months)
whether he/she donated blood in March 2007

csvstat --mean data_dl_out.csv
1. a: 373.5
2. Recency (months): 9.507
3. Frequency (times): 5.515
4. Monetary (c.c. blood): 1,378.676
5. Time (months): 34.282
6. whether he/she donated blood in March 2007: None

import pandas as pd
data = pd.read_csv('data_dl_out.csv')
data.head()

data = data.rename(columns={'Recency (months)': 'recency',
                            'Frequency (times)': 'frequency',
                            'Monetary (c.c. blood)': 'volume',
                            'Time (months)': 'time',
                            'whether he/she donated blood in March 2007': 'target'})
data.to_csv('data_clean.csv')

   recency  frequency  volume  time  target
0        2         50   12500    98       1
1        0         13    3250    28       1
2        1         16    4000    35       1
3        2         20    5000    45       1

csvsql --query "select frequency, count(*) as rows from data_clean where target = 1 group by frequency order by 2 desc" data_clean.csv
d:\python\lib\site-packages\win32\lib\pywintypes.py:2: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
d:\python\lib\site-packages\agate\utils.py:276: UnnamedColumnWarning: Column 0 has no name. Using "a".
frequency,rows
1.0,20
5.0,20
2.0,19
6.0,17
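For comparison, a sketch of the same query in pandas, run against the data_clean.csv written above:

import pandas as pd

data = pd.read_csv('data_clean.csv')
# rows per frequency among donors, sorted by count descending
rows = (data[data['target'] == 1]
        .groupby('frequency')
        .size()
        .sort_values(ascending=False))
print(rows.head())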
https://oicebot.github.io/2019/07/25/five-command-line-tools-for-data-science.html

SQL-style data analysis with Pandas

import pandas as pd
import numpy as np

url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')
tips = pd.read_csv(url)
output = tips.head()
SQL: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;

output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
   total_bill   tip smoker    time
0       16.99  1.01     No  Dinner
1       10.34  1.66     No  Dinner
2       21.01  3.50     No  Dinner
3       23.68  3.31     No  Dinner
4       24.59  3.61     No  Dinner
SQL: SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;

output = tips[tips['time'] == 'Dinner'].head(5)
# or
output = tips.query("time == 'Dinner'").head(5)

SQL: SELECT * FROM tips WHERE time = 'Dinner';

output = tips[(tips['time'] == 'Dinner')]
SQL: SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;

output = tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
SQL: SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;

output = tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
SQL: SELECT * FROM tips WHERE size IN (5, 6);

output = tips[tips['size'].isin([5, 6])]
SQL: SELECT sex, count(*) FROM tips GROUP BY sex;
https://learnku.com/articles/29825#replies
output = tips.groupby('sex').size()
Converting to a plain list:
>>> tips.total_bill.head().tolist()
[16.99, 10.34, 21.01, 23.68, 24.59]
>>> tips.columns.tolist()
['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

Comparing with is in Python

if appointment.time_slot_id is time_slot.id:
    time_slot_appointments.append(appointment)
The problem is that is. Strangest of all, as long as time_slot_id and time_slot.id were both integers no larger than 256, this code worked perfectly; only once there were enough appointments for time_slot_id or time_slot.id to exceed 256 did the bug appear: from then on, the expression always returned False.

A comparison made with the is keyword is an identity (reference) comparison. The "reference" here is like an index number, an address, a pointer to an object. Comparing with is is exactly the root cause of this odd bug.
A comparison made with the == operator is a value comparison, i.e. it compares the two objects' values.
In Python, integers are stored as PyLong objects, a subtype of PyObject. To reduce the memory-management overhead of handling small integers, the CPython interpreter uses a "small integer object pool": PyLong objects for the values -5 through 256 are pre-allocated in CPython's private heap and accessed through the small_ints array.
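A quick interactive demo of that pool boundary (CPython-specific; results can differ when the statements are compiled together):
>>> a = 256
>>> b = 256
>>> a is b    # both names point at the cached PyLong for 256
True
>>> a = 257
>>> b = 257
>>> a is b    # two distinct objects outside the small-int pool
False
>>> a == b    # value comparison, which is what the code actually wanted
True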
One way to fix the bug is to change:

if appointment.time_slot_id is time_slot.id:
into:

if appointment.time_slot_id == time_slot.id:
Only use is when you are quite sure you mean to compare two objects' identities. https://oicebot.github.io/2019/07/11/the-dangers-of-using-is-in-python.html

Validating bracket strings

def isValid(s):  # originally a method (self, s); made standalone here
    stack = []
    paren_map = {')': '(', ']': '[', '}': '{'}

    for c in s:
        if c not in paren_map:
            stack.append(c)  # push opening brackets
        elif not stack or paren_map[c] != stack.pop():
            return False
    return not stack  # valid only if every bracket was matched
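Example usage with the fixed function above:
>>> isValid("()[]{}")
True
>>> isValid("([)]")
False
>>> isValid("(((")
False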

Could not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate

pip install iredis
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
Could not fetch URL https://pypi.org/simple/iredis/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/iredis/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
ERROR: Could not find a version that satisfies the requirement iredis (from versions: none)
ERROR: No matching distribution found for iredis
Could not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/pip/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
Download pip from https://pypi.python.org/pypi/pip#downloads, e.g. https://files.pythonhosted.org/packages/ce/ea/9b445176a65ae4ba22dce1d93e4b5fe182f953df71a145f557cffaffc1bf/pip-19.3.1.tar.gz

Extract it, then run python setup.py install:
Installing pip.exe script to D:\python\Scripts
Installing pip.exe.manifest script to D:\python\Scripts
Installing pip3-script.py script to D:\python\Scripts
Installing pip3.exe script to D:\python\Scripts
Installing pip3.exe.manifest script to D:\python\Scripts
Installing pip3.7-script.py script to D:\python\Scripts
Installing pip3.7.exe script to D:\python\Scripts
Installing pip3.7.exe.manifest script to D:\python\Scripts

Installed d:\python\lib\site-packages\pip-19.3.1-py3.7.egg
Processing dependencies for pip==19.3.1
Finished processing dependencies for pip==19.3.1


pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org iredis
Collecting iredis
Downloading https://files.pythonhosted.org/packages/5c/1f/2da6df9c698a586f66bbb4153b7b2a75c62ce1e94aaf04ffaed1954163ad/iredis-0.7.0-py3-none-any.whl (42kB)
|████████████████████████████████| 51kB 363kB/s
Collecting click<9,>=8

One-liners

# dict comprehension
{v: k for k, v in some_dict.items()}
# set comprehension
{x**2 for x in [1, 1, 2]}
# list comprehension
[i for i in range(30) if i % 3 == 0]
import itertools
a_list = [[1, 2], [3, 4], [5, 6]]
print(list(itertools.chain.from_iterable(a_list)))
# Output: [1, 2, 3, 4, 5, 6]
sum(a_list, [])
# Output: [1, 2, 3, 4, 5, 6]
# or
print(list(itertools.chain(*a_list)))
# Output: [1, 2, 3, 4, 5, 6]
python -c "import csv,json;print(json.dumps(list(csv.reader(open('csv_file.csv')))))"
python -m cProfile my_script.py
cat file.json | python -m json.tool
https://learnku.com/articles/39048#reply125307

Working out a coworker's ID number with Python


Generate all dates in 1993 with Python, using the same dateRange and vali_dator code as in the Validating Chinese ID numbers section above.



Then open the 12306 website, go to add a frequent contact, and enter 李大伟 plus each candidate ID number in turn.

If the ID number and the name match, the site reports the check as passed; if the check fails, the ID number and name don't belong together. https://github.com/zpw1995/aotodata/tree/master/interest/ID_card

Bilibili danmaku (bullet-screen comments)

from bs4 import BeautifulSoup
import pandas as pd
import requests
# https://github.com/zpw1995/aotodata/blob/master/bilibili_danmu/B%E7%AB%99%E5%BC%B9%E5%B9%95%E7%88%AC%E8%99%AB.py
url = 'http://comment.bilibili.com/123519261.xml'
html = requests.get(url)
html.encoding = 'utf8'

# every danmaku comment lives in a <d> element of the XML feed
soup = BeautifulSoup(html.text, 'lxml')
results = soup.find_all('d')

comments = [comment.text for comment in results]
comments_dict = {'comments': comments}

df = pd.DataFrame(comments_dict)
df.to_csv('bili_ai5.csv', encoding='utf-8-sig')

Pandas

 import pandas as pd
import numpy as np
# read csv
df = pd.read_csv('xxx.csv')

# pkl format https://learnku.com/articles/39739
df.to_pickle('xxx.pkl')  # save
df = pd.read_pickle('xxx.pkl')  # load

# hdf format
df.to_hdf('xxx.hdf', 'df')  # save
df = pd.read_hdf('xxx.hdf', 'df')  # load

boolean = [True, False]
gender = ["男", "女"]
color = ["white", "black", "yellow"]
data = pd.DataFrame({
    "height": np.random.randint(150, 190, 100),
    "weight": np.random.randint(40, 90, 100),
    "smoker": [boolean[x] for x in np.random.randint(0, 2, 100)],
    "gender": [gender[x] for x in np.random.randint(0, 2, 100)],
    "age": np.random.randint(15, 90, 100),
    "color": [color[x] for x in np.random.randint(0, len(color), 100)]
})
>>> data
   height  weight  smoker gender  age   color
0     186      77   False      女   59   black
1     162      62   False      女   75  yellow
2     187      78   False      男   66   black
3     166      45    True      男   38   white
# 1. map with a dict
data["gender"] = data["gender"].map({"男": 1, "女": 0})

# 2. map with a function
def gender_map(x):
    gender = 1 if x == "男" else 0
    return gender
# note: pass the function object itself, without parentheses
data["gender"] = data["gender"].map(gender_map)

def apply_age(x, bias):
    return x + bias

# pass extra arguments as a tuple
data["age"] = data["age"].apply(apply_age, args=(-3,))
# sum along axis 0
data[["height", "weight", "age"]].apply(np.sum, axis=0)

# take logarithms along axis 0
data[["height", "weight", "age"]].apply(np.log, axis=0)

def BMI(series):
    weight = series["weight"]
    height = series["height"] / 100
    BMI = weight / height ** 2
    return BMI

data["BMI"] = data.apply(BMI, axis=1)
df.applymap(lambda x: "%.2f" % x)
https://learnku.com/articles/39734

In [12]: data.groupby("company").agg('mean')
Out[12]:
         salary    age
company
A         21.50  27.50
B         13.00  29.00
C         29.25  27.25
In [17]: data.groupby('company').agg({'salary': 'median', 'age': 'mean'})
Out[17]:
         salary    age
company
A          21.5  27.50
B          10.0  29.00
C          30.0  27.25

In [24]: data['avg_salary'] = data.groupby('company')['salary'].transform('mean')
https://learnku.com/articles/39735
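The shape difference between agg and transform, as a self-contained sketch with toy data of my own:

import pandas as pd

data = pd.DataFrame({'company': ['A', 'A', 'B'],
                     'salary': [20, 23, 13]})
print(data.groupby('company')['salary'].agg('mean'))   # collapses to one row per company
data['avg_salary'] = data.groupby('company')['salary'].transform('mean')
print(data)                                            # broadcast back: one value per original row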

A generic requests page-fetching template

# -*- coding: utf-8 -*-
import requests

def getHtmlText(url):
    try:
        response = requests.get(url)
        # raise an HTTPError if the status code is not 200
        response.raise_for_status()
        # set the correct encoding
        response.encoding = response.apparent_encoding
        return response.text
    except:
        return "Something Wrong!"

url = 'http://www.baidu.com'

result = getHtmlText(url)
print(result)
http://www.siya89.com/blog/python%20zero

Could not fetch URL


pip install -U requests
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
Could not fetch URL https://pypi.org/simple/requests/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/requests/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
Requirement already up-to-date: requests in d:\python\lib\site-packages (2.21.0)
Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in d:\python\lib\site-packages (from requests) (2.8)

pip --trusted-host pypi.doubanio.com install -U tqdm -i http://pypi.doubanio.com/simple
Looking in indexes: http://pypi.doubanio.com/simple
Collecting tqdm
Downloading http://pypi.doubanio.com/packages/4a/1c/6359be64e8301b84160f6f6f7936bbfaaa5e9a4eab6cbc681db07600b949/tqdm-4.45.0-py2.py3-none-any.whl (60kB)
|████████████████████████████████| 61kB 1.9MB/s
Installing collected packages: tqdm
Found existing installation: tqdm 4.28.1
Uninstalling tqdm-4.28.1:
Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.45.0

Create or edit the pip config file (~/.pip/pip.conf on Linux, %HOMEPATH%\pip\pip.ini on Windows) with:

[global]
index-url = http://pypi.douban.com/simple
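On pip 10+ the same setting can also be written from the command line with the pip config subcommand:

pip config set global.index-url http://pypi.douban.com/simple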

Python Requests throwing SSLError

import urllib3

urllib3.disable_warnings()
pip install certifi


>>> import requests
>>> requests.certs.where()
'D:\\python\\lib\\site-packages\\certifi\\cacert.pem'
requests.get(url, verify=False)
cafile = 'cacert.pem'  # http://curl.haxx.se/ca/cacert.pem
r = requests.get(url, verify=cafile)

requests.get("https://api.github.com/events", verify='/path/to/my/ca.crt')  # verify (not cert) takes the CA bundle path; cert is for client certificates

https://stackoverflow.com/questions/10667960/python-requests-throwing-sslerror
https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification

Python data analysis in practice | Revisiting Jordan's career through data

Creating a Python package

Building a chatbot with Flask and ChatterBot

Learning Flask through a small project

A Flask project structure

Pure front-end face recognition that auto-fits a Christmas hat

A WeChat bot with Python Flask

Course downloads for China University MOOC, XuetangX, NetEase Cloud Classroom, CNMOOC, and icourse.

100 NumPy exercises

Python OCR (recognizing text in images)

Read this article before you learn Python!

Advanced Python books

A general-purpose extractor for news sites https://github.com/kingname/GeneralNewsExtractor

Packaging WeChat official-account articles into PDF with Python

Scraping 2018 front-end hot topics with Python

A 30-minute introduction to the data-analysis tool Pandas

PyCharm 2019 activation

Sending WeChat image alerts with the wkhtmltopdf command-line tool

Decoding the puzzle to join a matchmaking chat group with Python

Turning ordinary video into code video with OpenCV and FFmpeg

Python 3 documentation

TensorFlow from scratch tensorflow.google.cn/

Online chat with SSE using Redis, Flask, and Vue

Python community articles

Batch-renaming Chinese file names to English, Python edition

Simulated logins for well-known sites, to ease scraping of login-required sites: fuck-login

pandas

Quickly packaging and publishing software on PyPI

Intermediate Python

A tutorial on Supervisor, the Python process manager

Python beginner tutorial - course outline

A scraper for NetEase Cloud Music song comments

Scraping Douban short reviews of 后来的我们 (Us and Them)

Python tricks and hacks

Collecting subdomains by brute force https://5alt.me/tools/

Python simulated logins for some major sites, plus some simple scrapers

Python scraping for beginners https://piaosanlang.gitbooks.io/spiders/content/01day/README1.html

Python Battle City (tank game) source code https://learnku.com/articles/34725

PyCharm activation codes (expired)

Python tutorial

Decompiling and calling Youdao Translate from Python

Converting between PDF and images

Pandas study notes